Skip to content
Prev Previous commit
Next Next commit
multiclass oversampling
  • Loading branch information
Andrea Lorenzon committed Sep 15, 2020
commit 8bbbd2e9a408e736eee6c300c864405bfbb9d84e
36 changes: 23 additions & 13 deletions imblearn/over_sampling/_rose.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from scipy import sparse

from sklearn.utils import check_random_state
from sklearn.utils.multiclass import type_of_target

from .base import BaseOverSampler

from ..utils import check_target_type
#from ..utils import check_target_type
from ..utils._validation import _deprecate_positional_args
from sklearn.utils.multiclass import type_of_target


class ROSE(BaseOverSampler):

Expand Down Expand Up @@ -58,7 +59,9 @@ def _make_samples(self,
n_class_samples,
h_shrink):
""" A support function that returns artificial samples constructed from
FIXME
a random subsample of the data, by adding a multiviariate gaussian kernel
and sampling from this distribution. An optional shrink factor can be
included, to compress/dilate the kernel.

Parameters
----------
Expand All @@ -83,18 +86,22 @@ def _make_samples(self,
Target values for synthetic samples.

"""
p = X.shape[1]

# get number of features
number_of_features = X.shape[1]
# import random state from API
random_state = check_random_state(self.random_state)
samples_indices = random_state.choice(
class_indices, size=n_class_samples, replace=True)

h_opt = (4 / ((p + 2) * len(class_indices))) ** (1 / (p + 4))
H_opt = h_shrink * h_opt * np.diagflat(
X[class_indices, :].std(axis=0, ddof=1))

# get random subsample of data with replacement
samples_indices = random_state.choice(class_indices, size=n_class_samples, replace=True)
# compute optimal min(AMISE)
minimize_amise = (4 / ((number_of_features + 2) * len(
class_indices))) ** (1 / (number_of_features + 4))
# create a diagonal matrix with the st.dev. of all classes
variances = np.diagflat(X[class_indices, :].std(axis=0, ddof=1))
# compute H_opt = optional shrink factors * min(AMISE) * kernel variance
h_opt = h_shrink * minimize_amise * variances
# (sample from multivariate normal)* h_opt + original values
Xrose = np.random.standard_normal(
size=(n_class_samples, p)) @ H_opt + X[samples_indices, :]
size=(n_class_samples, number_of_features)) @ h_opt + X[samples_indices, :]

return Xrose

Expand All @@ -112,8 +119,11 @@ def _fit_resample(self, X, y):
}

for class_sample, n_samples in self.sampling_strategy_.items():
# get indices of all y's with a given class n
class_indices = np.flatnonzero(y == class_sample)
# compute final n. of samples, by number of elements of a class + n_samples
n_class_samples = len(class_indices) + n_samples
# resample
X_new = self._make_samples(X,
class_indices,
n_class_samples,
Expand Down