17 changes: 17 additions & 0 deletions doc/api.rst
@@ -158,3 +158,20 @@ Functions
:toctree: generated/

datasets.make_imbalance


Utilities
=========

.. automodule:: imblearn.utils
:no-members:
:no-inherited-members:

.. currentmodule:: imblearn

Functions
---------
.. autosummary::
:toctree: generated/

utils.estimator_checks.check_estimator
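
The new `Utilities` section above exposes ``utils.estimator_checks.check_estimator``. Below is a minimal sketch of how it could be invoked on one of the existing samplers; the choice of ``RandomUnderSampler`` and the class-based calling convention (mirroring scikit-learn's checker of the same era) are assumptions, not part of this diff.

```python
# Hedged sketch: exercise the common estimator checks against a sampler.
# Assumes imblearn.utils.estimator_checks.check_estimator accepts a class,
# as scikit-learn's check_estimator did at the time; RandomUnderSampler is
# only an illustrative target.
from imblearn.under_sampling import RandomUnderSampler
from imblearn.utils.estimator_checks import check_estimator

# Raises an exception if the sampler violates one of the shared contracts
# (fit/sample behaviour, input validation, ...).
check_estimator(RandomUnderSampler)
```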
37 changes: 30 additions & 7 deletions doc/whats_new.rst
@@ -4,6 +4,36 @@
Release history
===============

.. _changes_0_3:

Changelog
---------

New features
~~~~~~~~~~~~

- Turn off steps in :class:`pipeline.Pipeline` using the `None`
object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~

- All the unit tests have been factorized and a common `check_estimator`
test, derived from scikit-learn, has been added. By `Guillaume Lemaitre`_.
- Added a script to automatically build and upload conda packages. By
`Guillaume Lemaitre`_.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `__init__` has been removed from :class:`base.SamplerMixin` to make it a
real mixin class. By `Guillaume Lemaitre`_.
- Creation of a module `exceptions` to handle consistent raising of
errors. By `Guillaume Lemaitre`_.
- Creation of a module `utils.validation` to factorize the recurrent
validation checks. By `Guillaume Lemaitre`_.


.. _changes_0_2:

Version 0.2
@@ -32,7 +62,6 @@ New features

- Added AllKNN under sampling technique. By `Dayvid Oliveira`_.
- Added a module `metrics` implementing some specific scoring functions for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~
@@ -42,12 +71,6 @@ Enhancement
- Change from `cross_validation` module to `model_selection` module for
`sklearn` deprecation cycle. By `Dayvid Oliveira`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

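The 0.3 entry above, "Turn off steps in :class:`pipeline.Pipeline` using the `None` object", can be pictured with a small sketch. The dataset, the estimators, and the use of ``set_params`` to disable the step are illustrative assumptions rather than code from this PR.

```python
# Hedged sketch of disabling a pipeline step with None, per the 0.3 changelog.
# Dataset and estimator choices are arbitrary; set_params(step=None) is assumed
# to be how the feature is exposed, mirroring scikit-learn's convention.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('clf', LogisticRegression())])

# Disable the resampling step without rebuilding the pipeline: the 'smote'
# step is skipped and only the classifier is fitted.
pipe.set_params(smote=None)
pipe.fit(X, y)
```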
9 changes: 7 additions & 2 deletions imblearn/__init__.py
@@ -10,13 +10,18 @@
ensemble
Module which provides methods generating an ensemble of
under-sampled subsets.
exceptions
Module including custom warnings and error classes used across
imbalanced-learn.
metrics
Module which provides metrics to quantify the classification performance
with imbalanced datasets.
over_sampling
Module which provides methods to over-sample a dataset.
under_sampling
Module which provides methods to under-sample a dataset.
utils
Module including various utilities.
pipeline
Module which allows to create a pipeline with scikit-learn estimators.
"""
@@ -34,6 +39,6 @@

# list all submodules available in imblearn and version
__all__ = [
'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling',
'pipeline', '__version__'
'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
'under_sampling', 'utils', 'pipeline', '__version__'
]
47 changes: 26 additions & 21 deletions imblearn/base.py
@@ -1,9 +1,10 @@
"""Base class for sampling"""

from __future__ import division, print_function
from __future__ import division

import logging
import warnings
from numbers import Real
from abc import ABCMeta, abstractmethod
from collections import Counter

@@ -12,6 +13,7 @@
from sklearn.externals import six
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted


class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
@@ -62,16 +64,9 @@ def fit(self, X, y):
self.logger.info('Compute classes statistics ...')

# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
self.logger.debug('The number of classes is %s', np.unique(y).size)
self.logger.debug('Shall we raise a warning: %s',
np.unique(y).size == 1)
if np.unique(y).size == 1:
warnings.simplefilter('always', UserWarning)
warnings.warn('Only one class detected, something will get wrong')
self.logger.debug('The warning should has been raised.')
if np.unique(y).size <= 1:
raise ValueError("Sampler can't balance when only one class is"
" present.")

# Store the size of X to check at sampling time if we have the
# same data
@@ -88,12 +83,16 @@ def fit(self, X, y):
np.unique(y).size, self.stats_c_)

# Check if the ratio provided at initialisation makes sense
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio < (self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]):
raise RuntimeError('The ratio requested at initialisation'
' should be greater than or equal to the'
' balancing ratio of the current data.')
' balancing ratio of the current data.'
' Got {} < {}.'.format(
self.ratio,
self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]))

return self

@@ -122,14 +121,14 @@ def sample(self, X, y):
X, y = check_X_y(X, y)

# Check that the data have been fitted
if not hasattr(self, 'stats_c_'):
raise RuntimeError('You need to fit the data, first!!!')
check_is_fitted(self, 'stats_c_')

# Check if the size of the data is identical to the one seen at fitting
if X.shape != self.X_shape_:
raise RuntimeError('The data that you attempt to resample do not'
' seem to be the one earlier fitted. Use the'
' fitted data.')
' fitted data. Shape of data is {}, got {}'
' instead.'.format(self.X_shape_, X.shape))

if hasattr(self, 'ratio'):
self._validate_ratio()
@@ -170,17 +169,23 @@ def _validate_ratio(self):
# The ratio correspond to the number of samples in the minority class
# over the number of samples in the majority class. Thus, the ratio
# cannot be greater than 1.0
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio > 1:
raise ValueError('Ration cannot be greater than one.')
raise ValueError('Ratio cannot be greater than one.'
' Got {}.'.format(self.ratio))
elif self.ratio <= 0:
raise ValueError('Ratio cannot be negative.')
raise ValueError('Ratio cannot be negative.'
' Got {}.'.format(self.ratio))

elif isinstance(self.ratio, six.string_types):
if self.ratio != 'auto':
raise ValueError('Unknown string for the parameter ratio.')
raise ValueError("Unknown string for the parameter ratio."
" Got {} instead of 'auto'".format(
self.ratio))
else:
raise ValueError('Unknown parameter type for ratio.')
raise ValueError('Unknown parameter type for ratio.'
' Got {} instead of float or str'.format(
type(self.ratio)))

def _validate_size_ngh_deprecation(self):
"Private function to warn about the deprecation about size_ngh."
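The reworked checks in ``base.py`` above reduce to two rules for a numeric ``ratio`` (the requested number of minority samples over the number of majority samples): ``_validate_ratio`` requires it to lie in ``(0, 1]``, and ``fit`` refuses a ratio smaller than the balance already present in the data. A standalone sketch of that arithmetic, written against plain Python rather than any imblearn class:

```python
# Standalone sketch of the ratio rules enforced in the diff above.
from __future__ import division  # keep the divisions floating-point on Python 2

from collections import Counter

y = [0] * 80 + [1] * 20                    # 80 majority samples, 20 minority
counts = Counter(y)
min_c = min(counts, key=counts.get)        # minority class label
maj_c = max(counts, key=counts.get)        # majority class label
current_balance = counts[min_c] / counts[maj_c]   # 20 / 80 = 0.25

def validate_ratio(ratio, current_balance):
    """Mirror the numeric checks of SamplerMixin._validate_ratio and fit."""
    if ratio > 1:
        raise ValueError('Ratio cannot be greater than one. Got {}.'.format(ratio))
    if ratio <= 0:
        raise ValueError('Ratio cannot be negative. Got {}.'.format(ratio))
    if ratio < current_balance:
        raise RuntimeError('The ratio requested at initialisation should be '
                           'greater than or equal to the balancing ratio of '
                           'the current data. Got {} < {}.'.format(
                               ratio, current_balance))
    return ratio

validate_ratio(0.5, current_balance)       # fine: 0.25 <= 0.5 <= 1
# validate_ratio(0.1, current_balance)     # would raise: 0.1 < 0.25
# validate_ratio(1.5, current_balance)     # would raise: greater than one
```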
6 changes: 4 additions & 2 deletions imblearn/combine/smote_enn.py
@@ -202,7 +202,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
' Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
@@ -234,7 +235,8 @@ def _validate_estimator(self):
if isinstance(self.enn, EditedNearestNeighbours):
self.enn_ = self.enn
else:
raise ValueError('enn needs to be an EditedNearestNeighbours.')
raise ValueError('enn needs to be an EditedNearestNeighbours.'
' Got {} instead.'.format(type(self.enn)))
# Otherwise create a default EditedNearestNeighbours
else:
self.enn_ = EditedNearestNeighbours(random_state=self.random_state)
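Since ``_validate_estimator`` now reports the offending type in its error message, here is a short sketch of the two supported ways of configuring :class:`SMOTEENN` suggested by the diff above; the parameter values are illustrative, and the ``smote_tomek.py`` diff below follows the same pattern with a ``TomekLinks`` instance.

```python
# Hedged sketch: SMOTEENN either receives ready-made SMOTE / EditedNearestNeighbours
# objects or builds default ones, as in the _validate_estimator diff above.
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# Explicit sub-estimators: anything that is not a SMOTE (resp. an
# EditedNearestNeighbours) instance now fails with
# "... needs to be a SMOTE object. Got <type> instead."
smote_enn = SMOTEENN(smote=SMOTE(random_state=0),
                     enn=EditedNearestNeighbours())

# Defaults: leaving smote/enn unset lets SMOTEENN create them itself.
smote_enn_default = SMOTEENN(random_state=0)
```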
8 changes: 5 additions & 3 deletions imblearn/combine/smote_tomek.py
@@ -173,7 +173,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
' Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
@@ -192,8 +193,9 @@ def _validate_estimator(self):
if isinstance(self.tomek, TomekLinks):
self.tomek_ = self.tomek
else:
raise ValueError('tomek needs to be a TomekLinks object.')
# Otherwise create a default EditedNearestNeighbours
raise ValueError('tomek needs to be a TomekLinks object.'
' Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(random_state=self.random_state)
