9 changes: 9 additions & 0 deletions doc/over_sampling.rst
@@ -198,6 +198,15 @@ Therefore, it can be seen that the samples generated in the first and last
columns belong to the same categories originally presented, without any
extra interpolation.

Furthermore, if the dataset consists solely of categorical features, one may
use the :class:`SMOTEN` class. This class generates samples in the same
fashion as :class:`SMOTENC`; however, only categorical features are
permitted. Every feature is treated as categorical, and it is therefore not
advised to use :class:`SMOTEN` on datasets containing both categorical and
continuous features::

>>> from imblearn.over_sampling import SMOTEN
>>> X[:, 1] = rng.randint(2, size=n_samples)
>>> smote_n = SMOTEN(random_state=0)
>>> X_resampled, y_resampled = smote_n.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 30), (1, 30)]
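
Note that the second column of ``X`` is overwritten with a random binary
feature before resampling, such that ``X`` contains only categorical values.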

.. topic:: References

.. [HWB2005] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
3 changes: 3 additions & 0 deletions doc/whats_new/v0.5.rst
@@ -27,6 +27,9 @@ Enhancement
and issue template showing how to print system and dependency information
from the command line. :issue:`557` by :user:`Alexander L. Hayes <batflyer>`.

- Add :class:`SMOTEN` to over-sample datasets containing only categorical
  features. By :user:`Thomas Kluiters <ThomasKluiters>`.

Maintenance
...........

3 changes: 2 additions & 1 deletion imblearn/over_sampling/__init__.py
@@ -9,6 +9,7 @@
from ._smote import BorderlineSMOTE
from ._smote import SVMSMOTE
from ._smote import SMOTENC
from ._smote import SMOTEN

__all__ = ['ADASYN', 'RandomOverSampler',
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC']
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTEN', 'SMOTENC']
172 changes: 164 additions & 8 deletions imblearn/over_sampling/_smote.py
@@ -950,11 +950,13 @@ class SMOTENC(SMOTE):

"""

def __init__(self, categorical_features, sampling_strategy='auto',
def __init__(self, categorical_features, sampling_strategy='auto', kind='regular',
random_state=None, k_neighbors=5, n_jobs=1):
super(SMOTENC, self).__init__(sampling_strategy=sampling_strategy,
random_state=random_state,
k_neighbors=k_neighbors,
n_jobs=n_jobs,
kind=kind,
ratio=None)
self.categorical_features = categorical_features

@@ -986,6 +988,15 @@ def _fit_resample(self, X, y):
self.n_features_ = X.shape[1]
self._validate_estimator()

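        # over-sample in the one-hot encoded space and map the synthetic
        # samples back to the original representation afterwards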
X_encoded = self._encode(X, y)

X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
X_encoded, y)
X_resampled = self._decode(X, X_resampled)

return X_resampled, y_resampled

def _encode(self, X, y):
# compute the median of the standard deviation of the minority class
target_stats = Counter(y)
class_minority = min(target_stats, key=target_stats.get)
@@ -1015,18 +1026,15 @@ def _fit_resample(self, X, y):
X_ohe = self.ohe_.fit_transform(
X_categorical.toarray() if sparse.issparse(X_categorical)
else X_categorical)

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.
X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) *
self.median_std_ / 2)
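        # two samples disagreeing on one categorical feature hence differ in
        # exactly two one-hot columns, each by `median_std_ / 2`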
X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
X_encoded, y)
return sparse.hstack((X_continuous, X_ohe), format='csr')

def _decode(self, X, X_resampled):
# reverse the encoding of the categorical features
X_res_cat = X_resampled[:, self.continuous_features_.size:]
X_res_cat.data = np.ones_like(X_res_cat.data)
@@ -1055,8 +1063,7 @@ def _fit_resample(self, X, y):
X_resampled.indices = col_indices
else:
X_resampled = X_resampled[:, indices_reordered]

return X_resampled, y_resampled
return X_resampled

def _generate_sample(self, X, nn_data, nn_num, row, col, step):
"""Generate a synthetic sample with an additional steps for the
@@ -1090,3 +1097,152 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step):
sample[start_idx + col_sel] = 1

return sparse.csr_matrix(sample) if sparse.issparse(X) else sample


class SMOTEN(SMOTENC):
"""Synthetic Minority Over-sampling Technique for Nominal data
(SMOTE-N).

    Unlike :class:`SMOTE`, SMOTE-N operates on datasets consisting exclusively
    of categorical features.

Read more in the :ref:`User Guide <smote_adasyn>`.

Parameters
----------
sampling_strategy : float, str, dict or callable, (default='auto')
Sampling information to resample the data set.

- When ``float``, it corresponds to the desired ratio of the number of
samples in the minority class over the number of samples in the
majority class after resampling. Therefore, the ratio is expressed as
:math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the
number of samples in the minority class after resampling and
:math:`N_{M}` is the number of samples in the majority class.

.. warning::
``float`` is only available for **binary** classification. An
error is raised for multi-class classification.

- When ``str``, specify the class targeted by the resampling. The
number of samples in the different classes will be equalized.
Possible choices are:

``'minority'``: resample only the minority class;

``'not minority'``: resample all classes but the minority class;

``'not majority'``: resample all classes but the majority class;

``'all'``: resample all classes;

``'auto'``: equivalent to ``'not majority'``.

- When ``dict``, the keys correspond to the targeted classes. The
values correspond to the desired number of samples for each targeted
class.

- When callable, function taking ``y`` and returns a ``dict``. The keys
correspond to the targeted classes. The values correspond to the
desired number of samples for each class.

random_state : int, RandomState instance or None, optional (default=None)
Control the randomization of the algorithm.

- If int, ``random_state`` is the seed used by the random number
generator;
- If ``RandomState`` instance, random_state is the random number
generator;
- If ``None``, the random number generator is the ``RandomState``
instance used by ``np.random``.

k_neighbors : int or object, optional (default=5)
        If ``int``, number of nearest neighbours used to construct synthetic
samples. If object, an estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
find the k_neighbors.

n_jobs : int, optional (default=1)
The number of threads to open if possible.

Notes
-----
See the original paper [1]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
originally proposed in [1]_.

See
:ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
and :ref:`sphx_glr_auto_examples_over-sampling_plot_smote.py`.

See also
--------
SMOTE : Over-sample using SMOTE.

SVMSMOTE : Over-sample using SVM-SMOTE variant.

BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.

ADASYN : Over-sample using ADASYN.

References
----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of Artificial
       Intelligence Research, 321-357, 2002.

Examples
--------

>>> from collections import Counter
>>> from numpy.random import RandomState
>>> from sklearn.datasets import make_classification
>>> from imblearn.over_sampling import SMOTEN
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=5, n_clusters_per_class=1, n_samples=1000, random_state=10)
>>> print('Original dataset shape (%s, %s)' % X.shape)
Original dataset shape (1000, 5)
>>> print('Original dataset samples in class 0: {}'.format(sum(y == 0)))
Original dataset samples in class 0: 100
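    >>> # overwrite X with random integers so every feature is categorical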
    >>> X[:, :] = RandomState(10).randint(0, 4, size=(1000, 5))
>>> sm = SMOTEN(random_state=42)
>>> X_res, y_res = sm.fit_resample(X, y)
>>> print('Resampled dataset samples in class 0: {}'.format(sum(y_res == 0)))
Resampled dataset samples in class 0: 900

"""

def __init__(self, sampling_strategy='auto', kind='regular',
random_state=None, k_neighbors=5, n_jobs=1):
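        # all features are treated as categorical; `categorical_features`
        # is populated automatically in `_validate_estimator`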
super(SMOTEN, self).__init__(categorical_features=[],
sampling_strategy=sampling_strategy,
random_state=random_state,
k_neighbors=k_neighbors,
n_jobs=n_jobs,
kind=kind)

def _validate_estimator(self):
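        # flag every column as categorical and none as continuous before
        # delegating the remaining validation to SMOTENC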
        self.categorical_features = np.arange(self.n_features_)
        self.continuous_features_ = np.asarray([], dtype=int)
super(SMOTEN, self)._validate_estimator()

def _decode(self, X, X_resampled):
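        # invert the one-hot encoding to recover the original categories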
X_unstacked = self.ohe_.inverse_transform(X_resampled)
if sparse.issparse(X):
X_unstacked = sparse.csr_matrix(X_unstacked)
return X_unstacked

def _encode(self, X, y):
self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore',
dtype=np.float64)
# the input of the OneHotEncoder needs to be dense
return self.ohe_.fit_transform(
X.toarray() if sparse.issparse(X)
else X)

