1 change: 1 addition & 0 deletions doc/api.rst
@@ -72,6 +72,7 @@ Prototype selection
over_sampling.ADASYN
over_sampling.RandomOverSampler
over_sampling.SMOTE
over_sampling.SMOTENC


.. _combine_ref:
47 changes: 45 additions & 2 deletions doc/over_sampling.rst
@@ -160,6 +160,44 @@ some variant of the SMOTE algorithm::
>>> print(sorted(Counter(y_resampled).items()))
[(0, 4674), (1, 4674), (2, 4674)]

When dealing with mixed data types, such as continuous and categorical
features, none of the presented methods (apart from
:class:`RandomOverSampler`) can deal with categorical features. The
:class:`SMOTENC` [CBHK2002]_ is an extension of the :class:`SMOTE` algorithm
in which the categorical features are treated differently::

>>> # create a synthetic data set with continuous and categorical features
>>> rng = np.random.RandomState(42)
>>> n_samples = 50
>>> X = np.empty((n_samples, 3), dtype=object)
>>> X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
>>> X[:, 1] = rng.randn(n_samples)
>>> X[:, 2] = rng.randint(3, size=n_samples)
>>> y = np.array([0] * 20 + [1] * 30)
>>> print(sorted(Counter(y).items()))
[(0, 20), (1, 30)]

In this data set, the first and last features are considered to be
categorical features. This information needs to be provided to
:class:`SMOTENC` via the ``categorical_features`` parameter, either as the
indices of these features or as a boolean mask marking them::

>>> from imblearn.over_sampling import SMOTENC
>>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
>>> X_resampled, y_resampled = smote_nc.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 30), (1, 30)]
>>> print(X_resampled[-5:])
[['A' 0.1989993778979113 2]
['B' -0.3657680728116921 1]
['A' 0.8790828729585258 2]
['B' 0.3710891618824609 2]
['B' 0.3327240726719727 0]]

Therefore, it can be seen that the samples generated in the first and last
columns belong to the categories originally present in the data, without any
additional interpolation.
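
The categorical columns can equivalently be flagged with a boolean mask
instead of indices. A minimal sketch, reusing the three-column data set
above (``categorical_mask`` is just an illustrative name)::

>>> categorical_mask = [True, False, True]  # first and last columns
>>> smote_nc_mask = SMOTENC(categorical_features=categorical_mask,
...                         random_state=0)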

.. topic:: References

.. [HWB2005] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
@@ -198,8 +236,13 @@ interpolation will create a sample on the line between :math:`x_{i}` and
:scale: 60
:align: center

Each SMOTE variant and ADASYN differ from each other by selecting the samples
:math:`x_i` ahead of generating the new samples.
SMOTE-NC slightly changes the way a new sample is generated by handling the
categorical features specifically. In fact, the categories of a newly
generated sample are decided by picking the most frequent category among the
nearest neighbors present during the generation.
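
As a rough illustration (not the actual implementation), this choice amounts
to a majority vote over the categorical values observed among the nearest
neighbors; ``neighbor_categories`` below is a hypothetical list of such
values::

>>> from collections import Counter
>>> neighbor_categories = ['A', 'B', 'A', 'A', 'C']
>>> most_frequent, _ = Counter(neighbor_categories).most_common(1)[0]
>>> print(most_frequent)
A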

The other SMOTE variants and ADASYN differ from each other in how they select
the samples :math:`x_i` ahead of generating the new samples.

The **regular** SMOTE algorithm --- cf. the :class:`SMOTE` object --- does not
impose any rule and will randomly pick up all the available :math:`x_i`.
6 changes: 6 additions & 0 deletions doc/whats_new/v0.0.4.rst
@@ -41,6 +41,12 @@ New features
under-sampling stage before each boosting iteration of AdaBoost.
:issue:`469` by :user:`Guillaume Lemaitre <glemaitre>`.

- Add :class:`imblearn.over_sampling.SMOTENC` which generates synthetic
samples on data sets with heterogeneous data types (continuous and
categorical features).
:issue:`412` by :user:`Denis Dudnik <ddudnik>` and
:user:`Guillaume Lemaitre <glemaitre>`.

Enhancement
...........

27 changes: 26 additions & 1 deletion examples/over-sampling/plot_comparison_over_sampling.py
@@ -21,7 +21,7 @@

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler

@@ -226,4 +226,29 @@ def _fit_resample(self, X, y):
ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# When dealing with a mix of continuous and categorical features, SMOTE-NC
# is the only method which can handle this case.

# create a synthetic data set with continuous and categorical features
rng = np.random.RandomState(42)
n_samples = 50
X = np.empty((n_samples, 3), dtype=object)
X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
X[:, 1] = rng.randn(n_samples)
X[:, 2] = rng.randint(3, size=n_samples)
y = np.array([0] * 20 + [1] * 30)

print('The original imbalanced dataset')
print(sorted(Counter(y).items()))
print('The first and last columns contain categorical features:')
print(X[:5])

smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print('Dataset after resampling:')
print(sorted(Counter(y_resampled).items()))
print('SMOTE-NC will generate categories for the categorical features:')
print(X_resampled[-5:])

plt.show()
3 changes: 2 additions & 1 deletion imblearn/over_sampling/__init__.py
@@ -8,6 +8,7 @@
from ._smote import SMOTE
from ._smote import BorderlineSMOTE
from ._smote import SVMSMOTE
from ._smote import SMOTENC

__all__ = ['ADASYN', 'RandomOverSampler',
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE']
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC']