99import math
1010import numbers
1111import warnings
12- from collections import Counter
1312
1413import numpy as np
1514from scipy import sparse
2322 check_random_state ,
2423)
2524from sklearn .utils .sparsefuncs_fast import (
26- csc_mean_variance_axis0 ,
2725 csr_mean_variance_axis0 ,
2826)
2927from sklearn .utils .validation import _num_features
@@ -116,11 +114,11 @@ def _make_samples(
116114 rows = np .floor_divide (samples_indices , nn_num .shape [1 ])
117115 cols = np .mod (samples_indices , nn_num .shape [1 ])
118116
119- X_new = self ._generate_samples (X , nn_data , nn_num , rows , cols , steps )
117+ X_new = self ._generate_samples (X , nn_data , nn_num , rows , cols , steps , y_type )
120118 y_new = np .full (n_samples , fill_value = y_type , dtype = y_dtype )
121119 return X_new , y_new
122120
123- def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
121+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps , y_type = None ):
124122 r"""Generate a synthetic sample.
125123
126124 The rule for the generation is:
@@ -155,6 +153,9 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
155153 steps : ndarray of shape (n_samples,), dtype=float
156154 Step sizes for new samples.
157155
156+ y_type : None
157+ Unused parameter. Kept only for compatibility with SMOTE-NC.
158+
158159 Returns
159160 -------
160161 X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -465,8 +466,9 @@ class SMOTENC(SMOTE):
465466 continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64
466467 Indices of the continuous features.
467468
468- median_std_ : float
469- Median of the standard deviation of the continuous features.
469+ median_std_ : dict of int -> float
470+ Median of the standard deviation of the continuous features for each
471+ class to be over-sampled.
470472
471473 n_features_ : int
472474 Number of features observed at `fit`.
@@ -627,23 +629,8 @@ def _fit_resample(self, X, y):
627629 self ._validate_column_types (X )
628630 self ._validate_estimator ()
629631
630- # compute the median of the standard deviation of the minority class
631- target_stats = Counter (y )
632- class_minority = min (target_stats , key = target_stats .get )
633-
634632 X_continuous = _safe_indexing (X , self .continuous_features_ , axis = 1 )
635633 X_continuous = check_array (X_continuous , accept_sparse = ["csr" , "csc" ])
636- X_minority = _safe_indexing (X_continuous , np .flatnonzero (y == class_minority ))
637-
638- if sparse .issparse (X ):
639- if X .format == "csr" :
640- _ , var = csr_mean_variance_axis0 (X_minority )
641- else :
642- _ , var = csc_mean_variance_axis0 (X_minority )
643- else :
644- var = X_minority .var (axis = 0 )
645- self .median_std_ = np .median (np .sqrt (var ))
646-
647634 X_categorical = _safe_indexing (X , self .categorical_features_ , axis = 1 )
648635 if X_continuous .dtype .name != "object" :
649636 dtype_ohe = X_continuous .dtype
@@ -664,28 +651,54 @@ def _fit_resample(self, X, y):
664651 if not sparse .issparse (X_ohe ):
665652 X_ohe = sparse .csr_matrix (X_ohe , dtype = dtype_ohe )
666653
667- # we can replace the 1 entries of the categorical features with the
668- # median of the standard deviation. It will ensure that whenever
669- # distance is computed between 2 samples, the difference will be equal
670- # to the median of the standard deviation as in the original paper.
671-
672- # In the edge case where the median of the std is equal to 0, the 1s
673- # entries will be also nullified. In this case, we store the original
674- # categorical encoding which will be later used for inverting the OHE
675- if math .isclose (self .median_std_ , 0 ):
676- self ._X_categorical_minority_encoded = _safe_indexing (
677- X_ohe .toarray (), np .flatnonzero (y == class_minority )
654+ X_encoded = sparse .hstack ((X_continuous , X_ohe ), format = "csr" , dtype = dtype_ohe )
655+ X_resampled = [X_encoded .copy ()]
656+ y_resampled = [y .copy ()]
657+
658+ # SMOTE resampling starts here
659+ self .median_std_ = {}
660+ for class_sample , n_samples in self .sampling_strategy_ .items ():
661+ if n_samples == 0 :
662+ continue
663+ target_class_indices = np .flatnonzero (y == class_sample )
664+ X_class = _safe_indexing (X_encoded , target_class_indices )
665+
666+ _ , var = csr_mean_variance_axis0 (
667+ X_class [:, : self .continuous_features_ .size ]
678668 )
669+ self .median_std_ [class_sample ] = np .median (np .sqrt (var ))
670+
671+ # In the edge case where the median of the std is equal to 0, the 1
672+ # entries will also be nullified. In this case, we store the original
673+ # categorical encoding, which will later be used for inverting the OHE
674+ if math .isclose (self .median_std_ [class_sample ], 0 ):
675+ # This variable will be used when generating data
676+ self ._X_categorical_minority_encoded = X_class [
677+ :, self .continuous_features_ .size :
678+ ].toarray ()
679+
680+ # we can replace the 1 entries of the categorical features with the
681+ # median of the standard deviation. It will ensure that whenever
682+ # distance is computed between 2 samples, the difference will be equal
683+ # to the median of the standard deviation as in the original paper.
684+ X_class_categorical = X_class [:, self .continuous_features_ .size :]
685+ # With one-hot encoding, the median will be repeated twice. We need
686+ # to divide by sqrt(2) such that we only have one median value
687+ # contributing to the Euclidean distance
688+ X_class_categorical .data [:] = self .median_std_ [class_sample ] / np .sqrt (2 )
689+ X_class [:, self .continuous_features_ .size :] = X_class_categorical
679690
680- # With one-hot encoding, the median will be repeated twice. We need to divide
681- # by sqrt(2) such that we only have one median value contributing to the
682- # Euclidean distance
683- X_ohe . data = (
684- np . ones_like ( X_ohe . data , dtype = X_ohe . dtype ) * self . median_std_ / np . sqrt ( 2 )
685- )
686- X_encoded = sparse . hstack (( X_continuous , X_ohe ), format = "csr" )
691+ self . nn_k_ . fit ( X_class )
692+ nns = self . nn_k_ . kneighbors ( X_class , return_distance = False )[:, 1 :]
693+ X_new , y_new = self . _make_samples (
694+ X_class , y . dtype , class_sample , X_class , nns , n_samples , 1.0
695+ )
696+ X_resampled . append ( X_new )
697+ y_resampled . append ( y_new )
687698
688- X_resampled , y_resampled = super ()._fit_resample (X_encoded , y )
699+ X_resampled = sparse .vstack (X_resampled , format = X_encoded .format )
700+ y_resampled = np .hstack (y_resampled )
701+ # SMOTE resampling ends here
689702
690703 # reverse the encoding of the categorical features
691704 X_res_cat = X_resampled [:, self .continuous_features_ .size :]
@@ -723,7 +736,7 @@ def _fit_resample(self, X, y):
723736
724737 return X_resampled , y_resampled
725738
726- def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
739+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps , y_type ):
727740 """Generate a synthetic sample with an additional step for the
728741 categorical features.
729742
@@ -741,7 +754,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
741754
742755 # In the case that the median std is equal to zero, we have to
743756 # re-create the non-null entries from the stored OHE encoding
744- if math .isclose (self .median_std_ , 0 ):
757+ if math .isclose (self .median_std_ [ y_type ] , 0 ):
745758 nn_data [
746759 :, self .continuous_features_ .size :
747760 ] = self ._X_categorical_minority_encoded
0 commit comments