implemented ROSE, still some failed test
Andrea Lorenzon committed Aug 21, 2020
commit 0a3307b6c595d5662380382d1c7d39a73d4ac3ec
134 changes: 132 additions & 2 deletions imblearn/over_sampling/_rose.py
@@ -1,7 +1,137 @@
"""Class to perform over-sampling using ROSE."""

import numpy as np
from scipy import sparse

from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing

from .base import BaseOverSampler
from ..utils import check_neighbors_object
from ..utils import Substitution
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..utils._validation import _deprecate_positional_args


@_deprecate_positional_args
@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class ROSE(BaseOverSampler):
    """Oversample using Random OverSampling Examples (ROSE) algorithm.

    New samples are generated by drawing from a Gaussian kernel density
    estimate centred on randomly selected observations of the class being
    resampled, as proposed in [2]_ and implemented in the R package
    described in [1]_.

    Read more in the :ref:`User Guide <rose>`.

    Parameters
    ----------
    {sampling_strategy}

    shrink_factors : dict, default=None
        Dictionary mapping each class label to the shrink factor applied
        to its Gaussian kernel. If ``None``, a factor of 1 is used for
        every class.

    {random_state}

    {n_jobs}

    Notes
    -----
    TODO: support multi-class resampling (a one-vs.-one scheme).

    References
    ----------
    .. [1] N. Lunardon, G. Menardi, N. Torelli, "ROSE: A Package for Binary
       Imbalanced Learning," R Journal, 6(1), 2014.

    .. [2] G. Menardi, N. Torelli, "Training and assessing classification
       rules with imbalanced data," Data Mining and Knowledge Discovery,
       28(1), pp. 92-122, 2014.
Comment on lines +36 to +41
Member

These are great to include. References should be referenced from the main docstring for the class. A short summary of the method would also be good.

Here's an example from the BorderlineSMOTE class:

class BorderlineSMOTE(BaseSMOTE):
    """Over-sampling using Borderline SMOTE.

    This algorithm is a variant of the original SMOTE algorithm proposed in
    [2]_. Borderline samples will be detected and used to generate new
    synthetic samples.

Contributor Author

I added a better description in the docstring.

I would like to add more complete information on the maths in the docs too, but I'm not familiar with Sphinx. Could you point me to some instructions or metadocumentation?

Member

Math typesetting should look familiar if you've seen LaTeX before; here's a short guide from sphinx's docs: https://www.sphinx-doc.org/en/1.0/ext/math.html. The syntax is based on reStructuredText, which feels similar to Markdown but has a powerful directive system.
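
For instance, the optimal-bandwidth formula used in _rose.py could be typeset with a math directive along these lines (a sketch only, not existing imblearn documentation):

.. math::

   h_{opt} = \left( \frac{4}{(p + 2)\, n} \right)^{1 / (p + 4)}

where n is the number of samples in the class and p the number of features, matching h_opt in _make_samples.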

Could you point me to some instructions or metadocumentation?

Nothing specific to imblearn. If you want to learn more, sphinx's "Getting Started" guide is a good place to start. (I'd recommend it regardless. Sphinx is used for a huge number of projects, so the skill is extremely transferable).

Our Makefile and conf.py in the docs/ directory are fairly standard. Building local documentation then looks like:

cd docs/
make html
xdg-open _build/html/index.html
Contributor Author

OK. I will just have some problems with the plots, but I'll manage to add everything to the docs.

I have another issue: failing checks, see below. It's not clear to me what they refer to, or what needs fixing.


"""


    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        shrink_factors=None,
        random_state=None,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.shrink_factors = shrink_factors
        self.n_jobs = n_jobs

    def _make_samples(self,
                      X,
                      class_indices,
                      n_class_samples,
                      h_shrink):
        """Return artificial samples for one class, drawn from a Gaussian
        kernel centred on randomly chosen observations of that class.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Observations from which the samples will be created.

        class_indices : ndarray
            Indices of the observations belonging to the target class.

        n_class_samples : int
            Total number of samples to generate for this class.

        h_shrink : float
            Shrink factor applied to the kernel bandwidth.

        Returns
        -------
        X_new : ndarray, shape (n_class_samples, n_features)
            Synthetically generated samples.
        """

        p = X.shape[1]

        random_state = check_random_state(self.random_state)
        samples_indices = random_state.choice(
            class_indices, size=n_class_samples, replace=True)

        # Silverman-type optimal bandwidth, scaled by the per-class shrink
        # factor and the per-feature standard deviation.
        h_opt = (4 / ((p + 2) * len(class_indices))) ** (1 / (p + 4))
        H_opt = h_shrink * h_opt * np.diagflat(
            X[class_indices, :].std(axis=0, ddof=1))

        # Draw Gaussian noise and translate it onto the selected samples.
        Xrose = random_state.standard_normal(
            size=(n_class_samples, p)) @ H_opt + X[samples_indices, :]

        return Xrose

    def _fit_resample(self, X, y):
        X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype)
        y_resampled = np.empty((0,), dtype=y.dtype)

        if self.shrink_factors is None:
            self.shrink_factors = {
                key: 1 for key in self.sampling_strategy_.keys()
            }

        # For each class to resample, generate a fully synthetic set of
        # len(class_indices) + n_samples observations.
        for class_sample, n_samples in self.sampling_strategy_.items():
            class_indices = np.flatnonzero(y == class_sample)
            n_class_samples = len(class_indices) + n_samples
            X_new = self._make_samples(X,
                                       class_indices,
                                       n_class_samples,
                                       self.shrink_factors[class_sample])
            y_new = np.array([class_sample] * n_class_samples)

            if sparse.issparse(X_new):
                X_resampled = sparse.vstack([X_resampled, X_new])
            else:
                X_resampled = np.vstack((X_resampled, X_new))

            y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled.astype(X.dtype), y_resampled.astype(y.dtype)
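
For quick reference while reviewing, here is a minimal usage sketch of the class added above; the toy data, the import path, and the printed summary are illustrative assumptions, not part of this diff:

# Usage sketch (assumes the private module path from this commit; the
# public re-export from imblearn.over_sampling is not shown in the diff).
import numpy as np

from imblearn.over_sampling._rose import ROSE

rng = np.random.RandomState(0)
# Illustrative imbalanced toy data: 40 samples of class 0, 10 of class 1.
X = np.vstack([rng.normal(0.0, 1.0, size=(40, 2)),
               rng.normal(2.0, 1.0, size=(10, 2))])
y = np.array([0] * 40 + [1] * 10)

# Per-class shrink factors for the Gaussian kernels (see the docstring above).
rose = ROSE(shrink_factors={0: 1, 1: 0.5}, random_state=0)
X_res, y_res = rose.fit_resample(X, y)
print(X_res.shape, np.unique(y_res, return_counts=True))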


35 changes: 34 additions & 1 deletion imblearn/over_sampling/tests/test_rose.py
@@ -1,4 +1,4 @@
"""Test the module SMOTENC."""
"""Test the module ROSE."""
# Authors: Andrea Lorenzon <andrelorenzon@gmail.com>
# License: MIT

@@ -19,3 +19,36 @@ def test_instance():
    rose = ROSE()
    assert rose is not None

RND_SEED = 0
X = np.array(
[
[0.11622591, -0.0317206],
[0.77481731, 0.60935141],
[1.25192108, -0.22367336],
[0.53366841, -0.30312976],
[1.52091956, -0.49283504],
[-0.28162401, -2.10400981],
[0.83680821, 1.72827342],
[0.3084254, 0.33299982],
[0.70472253, -0.73309052],
[0.28893132, -0.38761769],
[1.15514042, 0.0129463],
[0.88407872, 0.35454207],
[1.31301027, -0.92648734],
[-1.11515198, -0.93689695],
[-0.18410027, -0.45194484],
[0.9281014, 0.53085498],
[-0.14374509, 0.27370049],
[-0.41635887, -0.38299653],
[0.08711622, 0.93259929],
[1.70580611, -0.11219234],
]
)
Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
R_TOL = 1e-4


def test_rose():
    X_res, y_res = ROSE().fit_resample(X, Y)
    assert np.array_equal(np.unique(y_res), np.unique(Y))
    assert X_res.shape[1] == X.shape[1]
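
A further check one might add (a sketch only, exercising the shrink_factors parameter introduced in _rose.py; it is not part of this commit):

def test_rose_shrink_factors():
    # Pass explicit per-class shrink factors for the Gaussian kernels.
    X_res, y_res = ROSE(shrink_factors={0: 1, 1: 0.5},
                        random_state=RND_SEED).fit_resample(X, Y)
    # Feature dimensionality is preserved and no unknown labels appear.
    assert X_res.shape[1] == X.shape[1]
    assert set(np.unique(y_res)) <= set(np.unique(Y))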