Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
FIX deprecate ratio and ratio_
  • Loading branch information
glemaitre committed Mar 26, 2018
commit f3fef5cb0646f798afd6dbd83b0a57f1fcaeb22b
38 changes: 33 additions & 5 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from __future__ import division

import logging
import warnings
from abc import ABCMeta, abstractmethod

import numpy as np
Expand All @@ -18,6 +19,7 @@
from sklearn.utils.validation import check_is_fitted

from .utils import check_ratio, check_target_type, hash_X_y
from .utils.deprecation import deprecate_parameter


class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
Expand Down Expand Up @@ -61,7 +63,7 @@ def sample(self, X, y):
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

check_is_fitted(self, 'ratio_')
check_is_fitted(self, 'sampling_target_')
self._check_X_y(X, y)

output = self._sample(X, y)
Expand Down Expand Up @@ -143,10 +145,26 @@ class BaseSampler(SamplerMixin):
instead.
"""

def __init__(self, ratio='auto'):
def __init__(self, sampling_target='auto', ratio=None):
self.sampling_target = sampling_target
# FIXME: remove in 0.6
self.ratio = ratio
self.logger = logging.getLogger(self.__module__)

@property
def ratio_(self):
    """Deprecated read-only alias of ``sampling_target_``.

    Returns
    -------
    sampling_target_ : object
        The value stored in ``self.sampling_target_``.

    Warns
    -----
    DeprecationWarning
        Emitted on every access.
    """
    # FIXME: remove in 0.6
    # stacklevel=2 attributes the warning to the code that accessed
    # ``ratio_`` rather than to this property, so that it is reported
    # against the caller's module and line.
    warnings.warn("'ratio' and 'ratio_' are deprecated. "
                  "Use 'sampling_target' and 'sampling_target_' instead.",
                  DeprecationWarning, stacklevel=2)
    return self.sampling_target_

def _deprecate_ratio(self):
    """Forward the deprecated ``ratio`` parameter to ``sampling_target``.

    When ``self.ratio`` is ``None`` nothing happens. Otherwise
    ``deprecate_parameter`` is called for ``'ratio'`` (presumably to emit
    the deprecation warning -- confirm against ``utils.deprecation``) and
    ``self.sampling_target`` is overwritten with ``self.ratio``.
    """
    # NOTE: ``ratio`` and ``sampling_target`` are not meant to be set
    # together; when ``ratio`` is given it takes precedence.
    if self.ratio is None:
        return
    deprecate_parameter(self, '0.4', 'ratio', 'sampling_target')
    self.sampling_target = self.ratio

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.

Expand All @@ -164,11 +182,13 @@ def fit(self, X, y):
Return self.

"""
self._deprecate_ratio()
y = check_target_type(y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
self.X_hash_, self.y_hash_ = hash_X_y(X, y)
# self.sampling_type is already checked in check_ratio
self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
self.sampling_target_ = check_ratio(self.sampling_target, y,
self._sampling_type)

return self

Expand Down Expand Up @@ -250,15 +270,23 @@ def fit(self, X, y):
if self.accept_sparse else False)
self.X_hash_, self.y_hash_ = hash_X_y(X, y)
# when using a sampler, sampling_target_ is supposed to exist after fit
self.ratio_ = 'is_fitted'
self.sampling_target_ = 'is_fitted'

return self

@property
def ratio_(self):
    """Deprecated read-only alias of ``sampling_target_``.

    Returns
    -------
    sampling_target_ : object
        The value stored in ``self.sampling_target_``.

    Warns
    -----
    DeprecationWarning
        Emitted on every access.
    """
    # FIXME: remove in 0.6
    # stacklevel=2 attributes the warning to the code that accessed
    # ``ratio_`` rather than to this property, so that it is reported
    # against the caller's module and line.
    warnings.warn("'ratio' and 'ratio_' are deprecated. "
                  "Use 'sampling_target' and 'sampling_target_' instead.",
                  DeprecationWarning, stacklevel=2)
    return self.sampling_target_

def _sample(self, X, y, func=None, kw_args=None):
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']
if self.accept_sparse else False)
check_is_fitted(self, 'ratio_')
check_is_fitted(self, 'sampling_target_')
X_hash, y_hash = hash_X_y(X, y)
if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
raise RuntimeError("X and y need to be same array earlier fitted.")
Expand Down
25 changes: 19 additions & 6 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from __future__ import division

import logging
import warnings

from sklearn.utils import check_X_y

Expand Down Expand Up @@ -93,15 +94,17 @@ class SMOTEENN(SamplerMixin):
"""

def __init__(self,
ratio='auto',
sampling_target='auto',
random_state=None,
smote=None,
enn=None):
enn=None,
ratio=None):
super(SMOTEENN, self).__init__()
self.ratio = ratio
self.sampling_target = sampling_target
self.random_state = random_state
self.smote = smote
self.enn = enn
self.ratio = ratio
self.logger = logging.getLogger(__name__)

def _validate_estimator(self):
Expand All @@ -115,7 +118,9 @@ def _validate_estimator(self):
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
ratio=self.ratio, random_state=self.random_state)
sampling_target=self.sampling_target,
random_state=self.random_state,
ratio=self.ratio)

if self.enn is not None:
if isinstance(self.enn, EditedNearestNeighbours):
Expand All @@ -125,7 +130,15 @@ def _validate_estimator(self):
' Got {} instead.'.format(type(self.enn)))
# Otherwise create a default EditedNearestNeighbours
else:
self.enn_ = EditedNearestNeighbours(ratio='all')
self.enn_ = EditedNearestNeighbours(sampling_target='all')

@property
def ratio_(self):
    """Deprecated read-only alias of ``sampling_target_``.

    Returns
    -------
    sampling_target_ : object
        The value stored in ``self.sampling_target_``.

    Warns
    -----
    DeprecationWarning
        Emitted on every access.
    """
    # FIXME: remove in 0.6
    # stacklevel=2 attributes the warning to the code that accessed
    # ``ratio_`` rather than to this property, so that it is reported
    # against the caller's module and line.
    warnings.warn("'ratio' and 'ratio_' are deprecated. "
                  "Use 'sampling_target' and 'sampling_target_' instead.",
                  DeprecationWarning, stacklevel=2)
    return self.sampling_target_

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.
Expand All @@ -146,7 +159,7 @@ def fit(self, X, y):
"""
y = check_target_type(y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
self.ratio_ = self.ratio
self.sampling_target_ = self.sampling_target
self.X_hash_, self.y_hash_ = hash_X_y(X, y)

return self
Expand Down
25 changes: 19 additions & 6 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import division

import logging
import warnings

from sklearn.utils import check_X_y

Expand Down Expand Up @@ -100,15 +101,17 @@ class SMOTETomek(SamplerMixin):
"""

def __init__(self,
ratio='auto',
sampling_target='auto',
random_state=None,
smote=None,
tomek=None):
tomek=None,
ratio=None):
super(SMOTETomek, self).__init__()
self.ratio = ratio
self.sampling_target = sampling_target
self.random_state = random_state
self.smote = smote
self.tomek = tomek
self.ratio = ratio
self.logger = logging.getLogger(__name__)

def _validate_estimator(self):
Expand All @@ -123,7 +126,9 @@ def _validate_estimator(self):
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
ratio=self.ratio, random_state=self.random_state)
sampling_target=self.sampling_target,
random_state=self.random_state,
ratio=self.ratio)

if self.tomek is not None:
if isinstance(self.tomek, TomekLinks):
Expand All @@ -133,7 +138,15 @@ def _validate_estimator(self):
'Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(ratio='all')
self.tomek_ = TomekLinks(sampling_target='all')

@property
def ratio_(self):
    """Deprecated read-only alias of ``sampling_target_``.

    Returns
    -------
    sampling_target_ : object
        The value stored in ``self.sampling_target_``.

    Warns
    -----
    DeprecationWarning
        Emitted on every access.
    """
    # FIXME: remove in 0.6
    # stacklevel=2 attributes the warning to the code that accessed
    # ``ratio_`` rather than to this property, so that it is reported
    # against the caller's module and line.
    warnings.warn("'ratio' and 'ratio_' are deprecated. "
                  "Use 'sampling_target' and 'sampling_target_' instead.",
                  DeprecationWarning, stacklevel=2)
    return self.sampling_target_

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.
Expand All @@ -154,7 +167,7 @@ def fit(self, X, y):
"""
y = check_target_type(y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
self.ratio_ = self.ratio
self.sampling_target_ = self.sampling_target
self.X_hash_, self.y_hash_ = hash_X_y(X, y)

return self
Expand Down
20 changes: 12 additions & 8 deletions imblearn/ensemble/balance_cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,14 @@ class BalanceCascade(BaseEnsembleSampler):
"""

def __init__(self,
ratio='auto',
sampling_target='auto',
return_indices=False,
random_state=None,
n_max_subset=None,
estimator=None):
super(BalanceCascade, self).__init__(ratio=ratio)
estimator=None,
ratio=None):
super(BalanceCascade, self).__init__(sampling_target=sampling_target,
ratio=ratio)
self.random_state = random_state
self.return_indices = return_indices
self.estimator = estimator
Expand All @@ -138,7 +140,8 @@ def fit(self, X, y):
"""
super(BalanceCascade, self).fit(X, y)
y = check_target_type(y)
self.ratio_ = check_ratio(self.ratio, y, 'under-sampling')
self.sampling_target_ = check_ratio(self.sampling_target, y,
'under-sampling')
return self

def _validate_estimator(self):
Expand Down Expand Up @@ -201,8 +204,8 @@ def _sample(self, X, y):
# value which will be picked at each round
index_constant = np.empty((0, ), dtype=y.dtype)
for target_class in target_stats.keys():
if target_class in self.ratio_.keys():
n_samples = self.ratio_[target_class]
if target_class in self.sampling_target_.keys():
n_samples = self.sampling_target_[target_class]
# extract the data of interest for this round from the
# current class
index_class = np.flatnonzero(y == target_class)
Expand Down Expand Up @@ -246,8 +249,9 @@ def _sample(self, X, y):
# check that there is enough samples for another round
target_stats = Counter(safe_indexing(
y, np.flatnonzero(samples_mask)))
for target_class in self.ratio_.keys():
if target_stats[target_class] < self.ratio_[target_class]:
for target_class in self.sampling_target_.keys():
if (target_stats[target_class] <
self.sampling_target_[target_class]):
b_subset_search = False

X_resampled, y_resampled = [], []
Expand Down
11 changes: 10 additions & 1 deletion imblearn/ensemble/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

import warnings

import numpy as np

from sklearn.preprocessing import label_binarize
Expand All @@ -23,6 +25,13 @@ class BaseEnsembleSampler(BaseSampler):

_sampling_type = 'ensemble'

@property
def ratio_(self):
    """Deprecated read-only alias of ``sampling_target_``.

    Returns
    -------
    sampling_target_ : object
        The value stored in ``self.sampling_target_``.

    Warns
    -----
    DeprecationWarning
        Emitted on every access.
    """
    # FIXME: remove in 0.6
    # stacklevel=2 attributes the warning to the code that accessed
    # ``ratio_`` rather than to this property, so that it is reported
    # against the caller's module and line.
    warnings.warn("'ratio' and 'ratio_' are deprecated. "
                  "Use 'sampling_target' and 'sampling_target_' instead.",
                  DeprecationWarning, stacklevel=2)
    return self.sampling_target_

def sample(self, X, y):
"""Resample the dataset.

Expand All @@ -49,7 +58,7 @@ def sample(self, X, y):
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

check_is_fitted(self, 'ratio_')
check_is_fitted(self, 'sampling_target_')
self._check_X_y(X, y)

output = self._sample(X, y)
Expand Down
12 changes: 8 additions & 4 deletions imblearn/ensemble/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,12 @@ def __init__(self,
bootstrap_features=False,
oob_score=False,
warm_start=False,
ratio='auto',
sampling_target='auto',
replacement=False,
n_jobs=1,
random_state=None,
verbose=0):
verbose=0,
ratio=None):

super(BaggingClassifier, self).__init__(
base_estimator,
Expand All @@ -210,6 +211,7 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
self.sampling_target = sampling_target
self.ratio = ratio
self.replacement = replacement

Expand All @@ -230,8 +232,10 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
base_estimator = clone(default)

self.base_estimator_ = Pipeline(
[('sampler', RandomUnderSampler(ratio=self.ratio,
replacement=self.replacement)),
[('sampler', RandomUnderSampler(
sampling_target=self.sampling_target,
replacement=self.replacement,
ratio=self.ratio)),
('classifier', base_estimator)])

def fit(self, X, y):
Expand Down
10 changes: 6 additions & 4 deletions imblearn/ensemble/easy_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@ class EasyEnsemble(BaseEnsembleSampler):
"""

def __init__(self,
ratio='auto',
sampling_target='auto',
return_indices=False,
random_state=None,
replacement=False,
n_subsets=10):
super(EasyEnsemble, self).__init__(ratio=ratio)
n_subsets=10,
ratio=None):
super(EasyEnsemble, self).__init__(sampling_target=sampling_target,
ratio=ratio)
self.random_state = random_state
self.return_indices = return_indices
self.replacement = replacement
Expand Down Expand Up @@ -142,7 +144,7 @@ def _sample(self, X, y):

for _ in range(self.n_subsets):
rus = RandomUnderSampler(
ratio=self.ratio_, return_indices=True,
sampling_target=self.sampling_target_, return_indices=True,
random_state=random_state.randint(MAX_INT),
replacement=self.replacement)
sel_x, sel_y, sel_idx = rus.fit_sample(X, y)
Expand Down
10 changes: 6 additions & 4 deletions imblearn/over_sampling/adasyn.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,13 @@ class ADASYN(BaseOverSampler):
"""

def __init__(self,
ratio='auto',
sampling_target='auto',
random_state=None,
n_neighbors=5,
n_jobs=1):
super(ADASYN, self).__init__(ratio=ratio)
n_jobs=1,
ratio=None):
super(ADASYN, self).__init__(sampling_target=sampling_target,
ratio=ratio)
self.random_state = random_state
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
Expand Down Expand Up @@ -141,7 +143,7 @@ def _sample(self, X, y):
X_resampled = X.copy()
y_resampled = y.copy()

for class_sample, n_samples in self.ratio_.items():
for class_sample, n_samples in self.sampling_target_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
Expand Down
Loading