9 changes: 9 additions & 0 deletions doc/over_sampling.rst
@@ -198,6 +198,15 @@ Therefore, it can be seen that the samples generated in the first and last
columns belong to the same categories originally presented, without any
extra interpolation.

Furthermore, if the dataset consists solely of categorical features, one may
use the :class:`SMOTEN` class. This class generates samples in the same
fashion as :class:`SMOTENC`; however, only categorical features are
permitted. Every feature is treated as categorical, and it is therefore not
advised to use :class:`SMOTEN` on datasets containing both categorical and
continuous features::

>>> from imblearn.over_sampling import SMOTEN
>>> X[:, 1] = rng.randint(2, size=n_samples)
>>> smote_n = SMOTEN(random_state=0)
>>> X_resampled, y_resampled = smote_n.fit_resample(X, y)
>>> print(sorted(Counter(y_resampled).items()))
[(0, 30), (1, 30)]
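
Note that the second column of ``X`` is overwritten with a random binary
feature before resampling, such that ``X`` contains only categorical values.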

.. topic:: References

.. [HWB2005] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
3 changes: 3 additions & 0 deletions doc/whats_new/v0.5.rst
@@ -27,6 +27,9 @@ Enhancement
and issue template showing how to print system and dependency information
from the command line. :issue:`557` by :user:`Alexander L. Hayes <batflyer>`.

- Add :class:`SMOTEN` to over-sample datasets containing only categorical
  features. By :user:`Thomas Kluiters <ThomasKluiters>`.

Maintenance
...........

3 changes: 2 additions & 1 deletion imblearn/over_sampling/__init__.py
@@ -9,6 +9,7 @@
from ._smote import BorderlineSMOTE
from ._smote import SVMSMOTE
from ._smote import SMOTENC
from ._smote import SMOTEN

__all__ = ['ADASYN', 'RandomOverSampler',
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC']
'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTEN', 'SMOTENC']
172 changes: 164 additions & 8 deletions imblearn/over_sampling/_smote.py
@@ -950,11 +950,13 @@ class SMOTENC(SMOTE):

"""

def __init__(self, categorical_features, sampling_strategy='auto',
def __init__(self, categorical_features, sampling_strategy='auto', kind='regular',
random_state=None, k_neighbors=5, n_jobs=1):
super(SMOTENC, self).__init__(sampling_strategy=sampling_strategy,
random_state=random_state,
k_neighbors=k_neighbors,
n_jobs=n_jobs,
kind=kind,
ratio=None)
self.categorical_features = categorical_features

@@ -986,6 +988,15 @@ def _fit_resample(self, X, y):
self.n_features_ = X.shape[1]
self._validate_estimator()

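        # over-sample in the one-hot encoded space and map the synthetic
        # samples back to the original representation afterwards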
X_encoded = self._encode(X, y)

X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
X_encoded, y)
X_resampled = self._decode(X, X_resampled)

return X_resampled, y_resampled

def _encode(self, X, y):
# compute the median of the standard deviation of the minority class
target_stats = Counter(y)
class_minority = min(target_stats, key=target_stats.get)
@@ -1015,18 +1026,15 @@ def _fit_resample(self, X, y):
X_ohe = self.ohe_.fit_transform(
X_categorical.toarray() if sparse.issparse(X_categorical)
else X_categorical)

# we can replace the 1 entries of the categorical features with the
# median of the standard deviation. It will ensure that whenever
# distance is computed between 2 samples, the difference will be equal
# to the median of the standard deviation as in the original paper.
X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) *
self.median_std_ / 2)
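        # two samples disagreeing on one categorical feature hence differ in
        # exactly two one-hot columns, each by `median_std_ / 2`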
X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
X_encoded, y)
return sparse.hstack((X_continuous, X_ohe), format='csr')

def _decode(self, X, X_resampled):
# reverse the encoding of the categorical features
X_res_cat = X_resampled[:, self.continuous_features_.size:]
X_res_cat.data = np.ones_like(X_res_cat.data)
@@ -1055,8 +1063,7 @@ def _fit_resample(self, X, y):
X_resampled.indices = col_indices
else:
X_resampled = X_resampled[:, indices_reordered]

return X_resampled, y_resampled
return X_resampled

def _generate_sample(self, X, nn_data, nn_num, row, col, step):
"""Generate a synthetic sample with an additional steps for the
@@ -1090,3 +1097,152 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step):
sample[start_idx + col_sel] = 1

return sparse.csr_matrix(sample) if sparse.issparse(X) else sample


class SMOTEN(SMOTENC):
"""Synthetic Minority Over-sampling Technique for Nominal data
(SMOTE-N).

    Unlike :class:`SMOTE`, SMOTE-N operates on datasets consisting exclusively
    of categorical features.

Read more in the :ref:`User Guide <smote_adasyn>`.

Parameters
----------
sampling_strategy : float, str, dict or callable, (default='auto')
Sampling information to resample the data set.

- When ``float``, it corresponds to the desired ratio of the number of
samples in the minority class over the number of samples in the
majority class after resampling. Therefore, the ratio is expressed as
:math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the
number of samples in the minority class after resampling and
:math:`N_{M}` is the number of samples in the majority class.

.. warning::
``float`` is only available for **binary** classification. An
error is raised for multi-class classification.

- When ``str``, specify the class targeted by the resampling. The
number of samples in the different classes will be equalized.
Possible choices are:

``'minority'``: resample only the minority class;

``'not minority'``: resample all classes but the minority class;

``'not majority'``: resample all classes but the majority class;

``'all'``: resample all classes;

``'auto'``: equivalent to ``'not majority'``.

- When ``dict``, the keys correspond to the targeted classes. The
values correspond to the desired number of samples for each targeted
class.

- When callable, function taking ``y`` and returns a ``dict``. The keys
correspond to the targeted classes. The values correspond to the
desired number of samples for each class.

random_state : int, RandomState instance or None, optional (default=None)
Control the randomization of the algorithm.

- If int, ``random_state`` is the seed used by the random number
generator;
- If ``RandomState`` instance, random_state is the random number
generator;
- If ``None``, the random number generator is the ``RandomState``
instance used by ``np.random``.

k_neighbors : int or object, optional (default=5)
        If ``int``, number of nearest neighbours used to construct synthetic
samples. If object, an estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
find the k_neighbors.

n_jobs : int, optional (default=1)
The number of threads to open if possible.

Notes
-----
See the original paper [1]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
originally proposed in [1]_.

See
:ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
and :ref:`sphx_glr_auto_examples_over-sampling_plot_smote.py`.

See also
--------
SMOTE : Over-sample using SMOTE.

SVMSMOTE : Over-sample using SVM-SMOTE variant.

BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.

ADASYN : Over-sample using ADASYN.

References
----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of Artificial
       Intelligence Research, 321-357, 2002.

Examples
--------

>>> from collections import Counter
>>> from numpy.random import RandomState
>>> from sklearn.datasets import make_classification
>>> from imblearn.over_sampling import SMOTEN
>>> X, y = make_classification(n_classes=2, class_sep=2,
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
... n_features=5, n_clusters_per_class=1, n_samples=1000, random_state=10)
>>> print('Original dataset shape (%s, %s)' % X.shape)
Original dataset shape (1000, 5)
>>> print('Original dataset samples in class 0: {}'.format(sum(y == 0)))
Original dataset samples in class 0: 100
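    >>> # overwrite X with random integers so every feature is categorical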
    >>> X[:, :] = RandomState(10).randint(0, 4, size=(1000, 5))
>>> sm = SMOTEN(random_state=42)
>>> X_res, y_res = sm.fit_resample(X, y)
>>> print('Resampled dataset samples in class 0: {}'.format(sum(y_res == 0)))
Resampled dataset samples in class 0: 900

"""

def __init__(self, sampling_strategy='auto', kind='regular',
random_state=None, k_neighbors=5, n_jobs=1):
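        # all features are treated as categorical; `categorical_features`
        # is populated automatically in `_validate_estimator`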
super(SMOTEN, self).__init__(categorical_features=[],
sampling_strategy=sampling_strategy,
random_state=random_state,
k_neighbors=k_neighbors,
n_jobs=n_jobs,
kind=kind)

def _validate_estimator(self):
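        # flag every column as categorical and none as continuous before
        # delegating the remaining validation to SMOTENC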
        self.categorical_features = np.arange(self.n_features_)
        self.continuous_features_ = np.asarray([], dtype=int)
super(SMOTEN, self)._validate_estimator()

def _decode(self, X, X_resampled):
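        # invert the one-hot encoding to recover the original categories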
X_unstacked = self.ohe_.inverse_transform(X_resampled)
if sparse.issparse(X):
X_unstacked = sparse.csr_matrix(X_unstacked)
return X_unstacked

def _encode(self, X, y):
self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore',
dtype=np.float64)
# the input of the OneHotEncoder needs to be dense
return self.ohe_.fit_transform(
X.toarray() if sparse.issparse(X)
else X)

