scikit-learn-contrib · andrealorenzon · Aug 21, 2020 · Aug 21, 2020 · Aug 21, 2020 · Aug 21, 2020
diff --git a/README.rst b/README.rst
@@ -155,6 +155,7 @@ Below is a list of the methods currently implemented in this module.
  5. SVM SMOTE - Support Vectors SMOTE [10]_
  6. ADASYN - Adaptive synthetic sampling approach for imbalanced learning [15]_
  7. KMeans-SMOTE [17]_
+ 8. ROSE - Random OverSampling Examples [19]_
 
 * Over-sampling followed by under-sampling
  1. SMOTE + Tomek links [12]_
@@ -210,4 +211,6 @@ References:
 
 .. [17] : Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE"
 
-.. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197.
+.. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197.
+
+.. [19] : Menardi, G., Torelli, N.: "Training and assessing classification rules with imbalanced data", Data Mining and Knowledge Discovery, 28, (2014): 92–122
diff --git a/doc/bibtex/refs.bib b/doc/bibtex/refs.bib
@@ -193,3 +193,13 @@ @article{smith2014instance
  year={2014},
  publisher={Springer}
 }
+
+@article{torelli2014rose,
+ title={Training and assessing classification rules with imbalanced data},
+ author={Menardi, Giovanna and Torelli, Nicola},
+ journal={Data Mining and Knowledge Discovery},
+ volume={28},
+ pages={92-122},
+ year={2014},
+ publisher={Springer}
+}
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -198,6 +198,26 @@ Therefore, it can be seen that the samples generated in the first and last
 columns are belonging to the same categories originally presented without any
 other extra interpolation.
 
+Random OverSampling Examples (ROSE)
+-----------------------------------
+
+ROSE is an oversampling method, provided by the :class:`ROSE` class, that balances
+binary response datasets with a smoothed bootstrap resampling technique. New examples
+are generated in the neighborhood of existing samples by drawing from a smooth,
+unimodal and symmetric distribution centered on each of them. Generating samples this
+way corresponds to drawing data from the kernel density estimate (KDE)
+:math:`f(x \vert \mathcal{Y}_i)`.
+
+Article: :cite:`torelli2014rose`. 
+
+The class can be used in the following manner::
+
+ >>> from imblearn.over_sampling import ROSE
+ >>> X_resampled, y_resampled = ROSE().fit_resample(X, y)
+ >>> print(sorted(Counter(y_resampled).items()))
+ [(0, 4674), (1, 4674), (2, 4674)]
+ >>> clf_rose = LinearSVC().fit(X_resampled, y_resampled)
+
+
 Mathematical formulation
 ========================
 
@@ -273,7 +293,8 @@ Multi-class management
 ----------------------
 
 All algorithms can be used with multiple classes as well as binary classes
-classification. :class:`RandomOverSampler` does not require any inter-class
+classification, aside from :class:`ROSE`, which only works on binary classification problems.
+:class:`RandomOverSampler` does not require any inter-class
 information during the sample generation. Therefore, each targeted class is
 resampled independently. In the contrary, both :class:`ADASYN` and
 :class:`SMOTE` need information regarding the neighbourhood of each sample used

diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
@@ -10,6 +10,7 @@
 from ._smote import KMeansSMOTE
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
+from ._rose import ROSE
 
 __all__ = [
  "ADASYN",
@@ -19,4 +20,5 @@
  "BorderlineSMOTE",
  "SVMSMOTE",
  "SMOTENC",
+ "ROSE"
 ]
diff --git a/imblearn/over_sampling/_rose.py b/imblearn/over_sampling/_rose.py
@@ -0,0 +1,139 @@
+"""Class to perform over-sampling using ROSE."""
+
+import numpy as np
+from scipy import sparse
+from sklearn.utils import check_random_state
+from .base import BaseOverSampler
+from ..utils._validation import _deprecate_positional_args
+# from sklearn.utils import check_X_y
+
+
# NOTE(review): the ``{sampling_strategy}``, ``{random_state}`` and ``{n_jobs}``
# placeholders in the docstring are left untouched; they presumably need a
# docstring-substitution decorator that is not applied here -- confirm.
class ROSE(BaseOverSampler):
    """Oversample using Random OverSampling Examples (ROSE) algorithm.

    The algorithm generates new samples by a smoothed bootstrap approach.
    The generation of new examples corresponds to the generation of data from
    the kernel density estimate of f(x|Y_i), with a smoothing matrix H_j.
    A shrinking matrix can be provided, to set the bandwidth of the gaussian
    kernel.

    Read more in the :ref:`User Guide <rose>`.

    Parameters
    ----------
    {sampling_strategy}

    shrink_factors : dict, default=None
        Mapping from each class label to the shrink factor applied to the
        gaussian kernel of that class. When ``None``, a factor of 0.5 is used
        for every resampled class.

    {random_state}

    {n_jobs}

    Attributes
    ----------
    shrink_factors_ : dict
        The shrink factors actually used during the last call to
        ``_fit_resample``, keyed by class label.

    Notes
    -----
    TODO: Support for multi-class resampling. A one-vs.one scheme is used.

    References
    ----------
    .. [1] N. Lunardon, G. Menardi, N.Torelli, "ROSE: A Package for Binary
       Imbalanced Learning," R Journal, 6(1), 2014.

    .. [2] G Menardi, N. Torelli, "Training and assessing classification
       rules with imbalanced data," Data Mining and Knowledge
       Discovery, 28(1), pp.92-122, 2014.
    """

    @_deprecate_positional_args
    def __init__(self, *, sampling_strategy="auto", shrink_factors=None,
                 random_state=None, n_jobs=None):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.shrink_factors = shrink_factors
        self.n_jobs = n_jobs

    def _make_samples(self, X, class_indices, n_class_samples, h_shrink):
        """Generate synthetic samples for one class by smoothed bootstrap.

        Rows of the class are drawn at random with replacement, then
        multivariate gaussian noise is added to each drawn row. The noise is
        scaled, feature by feature, by the class standard deviation, by the
        bandwidth constant computed from the number of features and the
        class size, and by the shrink factor, which compresses or dilates
        the kernel.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Observations from which the samples will be created.

        class_indices : ndarray of shape (n_class_observations,)
            Row indices in ``X`` of the observations of the target class.

        n_class_samples : int
            The number of synthetic samples to generate for the class.

        h_shrink : float
            The shrink factor multiplying the kernel bandwidth.

        Returns
        -------
        X_new : ndarray of shape (n_class_samples, n_features)
            Synthetically generated samples.
        """
        number_of_features = X.shape[1]
        random_state = check_random_state(self.random_state)

        # Bootstrap step: pick the rows the kernels will be centered on.
        samples_indices = random_state.choice(
            class_indices, size=n_class_samples, replace=True)

        # Bandwidth constant depending on dimensionality and class size.
        minimize_amise = (
            4 / ((number_of_features + 2) * len(class_indices))
        ) ** (1 / (number_of_features + 4))

        # Per-feature standard deviation of the class. The unbiased estimate
        # (ddof=1) is undefined for a single observation, so fall back to a
        # zero bandwidth there, which simply replicates that observation.
        if len(class_indices) > 1:
            std_devs = np.std(X[class_indices, :], axis=0, ddof=1)
        else:
            std_devs = np.zeros(number_of_features)

        # Diagonal smoothing matrix of shape (n_features, n_features); it has
        # to be built from the standard deviations (not the other way round)
        # for the matrix product below to be well defined.
        h_opt = h_shrink * minimize_amise * np.diagflat(std_devs)

        # Draw the noise from the seeded generator so that results are
        # reproducible for a fixed ``random_state``.
        noise = random_state.standard_normal(
            size=(n_class_samples, number_of_features))

        return noise @ h_opt + X[samples_indices, :]

    def _fit_resample(self, X, y):
        """Build the resampled dataset, one class at a time.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input observations.

        y : ndarray of shape (n_samples,)
            Class label of each observation.

        Returns
        -------
        X_resampled : ndarray of shape (n_samples_new, n_features)
            Synthetic observations, cast to the dtype of ``X``.

        y_resampled : ndarray of shape (n_samples_new,)
            Class label of each synthetic observation, cast to the dtype
            of ``y``.
        """
        X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype)
        y_resampled = np.empty(0, dtype=y.dtype)

        # Resolve the default without overwriting the constructor parameter,
        # so that repeated fits and cloning see the user-provided value.
        if self.shrink_factors is None:
            shrink_factors = {key: 0.5 for key in self.sampling_strategy_}
        else:
            shrink_factors = self.shrink_factors
        self.shrink_factors_ = shrink_factors

        # NOTE(review): only the classes listed in ``sampling_strategy_`` end
        # up in the output, and their original rows are replaced by synthetic
        # ones. If ``sampling_strategy_`` omits the majority class, that class
        # is dropped from the result -- confirm this is intended.
        for class_sample, n_samples in self.sampling_strategy_.items():
            # Rows of X belonging to the current class.
            class_indices = np.flatnonzero(y == class_sample)
            # Final class size: current size plus the requested extra samples.
            n_class_samples = len(class_indices) + n_samples

            X_new = self._make_samples(X,
                                       class_indices,
                                       n_class_samples,
                                       shrink_factors[class_sample])
            y_new = np.array([class_sample] * n_class_samples)

            if sparse.issparse(X_new):
                X_resampled = sparse.vstack([X_resampled, X_new])
            else:
                X_resampled = np.vstack((X_resampled, X_new))

            y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled.astype(X.dtype), y_resampled.astype(y.dtype)
diff --git a/imblearn/over_sampling/tests/test_rose.py b/imblearn/over_sampling/tests/test_rose.py
@@ -0,0 +1,26 @@
+"""Test the module ROSE."""
+# Authors: Andrea Lorenzon <andrelorenzon@gmail.com>
+# License: MIT
+
+import numpy as np
+
+from imblearn.over_sampling import ROSE
+
+
def test_rose():
    """Check that ROSE runs and returns consistently shaped output."""
    RND_SEED = 0

    X = np.array([[1., 1., 1., 0],
                  [2., 2., 2., 0],
                  [3., 3., 3., 0],
                  [0.9, 0.9, 0.9, 0],
                  [1.8, 1.8, 1.8, 0],
                  [2.7, 2.7, 2.7, 0],
                  [1.1, 1.1, 1.1, 0],
                  [2.2, 2.2, 2.2, 0],
                  [3.3, 3.3, 3.3, 0]])
    Y = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    X_res, y_res = ROSE(random_state=RND_SEED).fit_resample(X, Y)

    # A bare call only proves that nothing raised; pin the invariants that
    # any resampler output must satisfy.
    assert X_res.shape[0] == y_res.shape[0]
    assert X_res.shape[1] == X.shape[1]
    # Resampling must not invent class labels absent from the input.
    assert set(np.unique(y_res)) <= set(np.unique(Y))