17 changes: 17 additions & 0 deletions doc/api.rst
@@ -158,3 +158,20 @@ Functions
:toctree: generated/

datasets.make_imbalance


Utilities
=========

.. automodule:: imblearn.utils
:no-members:
:no-inherited-members:

.. currentmodule:: imblearn

Functions
---------
.. autosummary::
:toctree: generated/

utils.estimator_checks.check_estimator
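
The new `Utilities` section above exposes ``utils.estimator_checks.check_estimator``. Below is a minimal sketch of how it could be invoked on one of the existing samplers; the choice of ``RandomUnderSampler`` and the class-based calling convention (mirroring scikit-learn's checker of the same era) are assumptions, not part of this diff.

```python
# Hedged sketch: exercise the common estimator checks against a sampler.
# Assumes imblearn.utils.estimator_checks.check_estimator accepts a class,
# as scikit-learn's check_estimator did at the time; RandomUnderSampler is
# only an illustrative target.
from imblearn.under_sampling import RandomUnderSampler
from imblearn.utils.estimator_checks import check_estimator

# Raises an exception if the sampler violates one of the shared contracts
# (fit/sample behaviour, input validation, ...).
check_estimator(RandomUnderSampler)
```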
37 changes: 30 additions & 7 deletions doc/whats_new.rst
@@ -4,6 +4,36 @@
Release history
===============

.. _changes_0_3:

Changelog
---------

New features
~~~~~~~~~~~~

- Turn off steps in :class:`pipeline.Pipeline` using the `None`
object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~

- All the unit tests have been factorized and a common `check_estimator`
test, derived from scikit-learn, has been added. By `Guillaume Lemaitre`_.
- Added a script to automatically build and upload conda packages. By
`Guillaume Lemaitre`_.

API changes summary
~~~~~~~~~~~~~~~~~~~

- `__init__` has been removed from :class:`base.SamplerMixin` to make it a
real mixin class. By `Guillaume Lemaitre`_.
- Creation of a module `exceptions` to handle consistent raising of
errors. By `Guillaume Lemaitre`_.
- Creation of a module `utils.validation` to factorize the recurrent
validation checks. By `Guillaume Lemaitre`_.


.. _changes_0_2:

Version 0.2
@@ -32,7 +62,6 @@ New features

- Added AllKNN under sampling technique. By `Dayvid Oliveira`_.
- Added a module `metrics` implementing some specific scoring functions for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
- Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By `Christos Aridas`_.

Enhancement
~~~~~~~~~~~
@@ -42,12 +71,6 @@ Enhancement
- Change from `cross_validation` module to `model_selection` module for
`sklearn` deprecation cycle. By `Dayvid Oliveira`_ and `Christos Aridas`_.

New features
~~~~~~~~~~~~

- Added AllKNN under sampling technique.
- Added support for bumpversion.

API changes summary
~~~~~~~~~~~~~~~~~~~

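The 0.3 entry above, "Turn off steps in :class:`pipeline.Pipeline` using the `None` object", can be pictured with a small sketch. The dataset, the estimators, and the use of ``set_params`` to disable the step are illustrative assumptions rather than code from this PR.

```python
# Hedged sketch of disabling a pipeline step with None, per the 0.3 changelog.
# Dataset and estimator choices are arbitrary; set_params(step=None) is assumed
# to be how the feature is exposed, mirroring scikit-learn's convention.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('clf', LogisticRegression())])

# Disable the resampling step without rebuilding the pipeline: the 'smote'
# step is skipped and only the classifier is fitted.
pipe.set_params(smote=None)
pipe.fit(X, y)
```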
9 changes: 7 additions & 2 deletions imblearn/__init__.py
@@ -10,13 +10,18 @@
ensemble
Module which provides methods generating an ensemble of
under-sampled subsets.
exceptions
Module including custom warnings and error classes used across
imbalanced-learn.
metrics
Module which provides metrics to quantify the classification performance
with imbalanced datasets.
over_sampling
Module which provides methods to over-sample a dataset.
under_sampling
Module which provides methods to under-sample a dataset.
utils
Module including various utilities.
pipeline
Module which allows to create a pipeline with scikit-learn estimators.
"""
@@ -34,6 +39,6 @@

# list all submodules available in imblearn and version
__all__ = [
'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling',
'pipeline', '__version__'
'combine', 'ensemble', 'exceptions', 'metrics', 'over_sampling',
'under_sampling', 'utils', 'pipeline', '__version__'
]
47 changes: 26 additions & 21 deletions imblearn/base.py
@@ -1,9 +1,10 @@
"""Base class for sampling"""

from __future__ import division, print_function
from __future__ import division

import logging
import warnings
from numbers import Real
from abc import ABCMeta, abstractmethod
from collections import Counter

@@ -12,6 +13,7 @@
from sklearn.externals import six
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted


class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
@@ -62,16 +64,9 @@ def fit(self, X, y):
self.logger.info('Compute classes statistics ...')

# Raise an error if there is only one class
# if uniques.size == 1:
# raise RuntimeError("Only one class detected, aborting...")
# Raise a warning for the moment to be compatible with BaseEstimator
self.logger.debug('The number of classes is %s', np.unique(y).size)
self.logger.debug('Shall we raise a warning: %s',
np.unique(y).size == 1)
if np.unique(y).size == 1:
warnings.simplefilter('always', UserWarning)
warnings.warn('Only one class detected, something will get wrong')
self.logger.debug('The warning should has been raised.')
if np.unique(y).size <= 1:
raise ValueError("Sampler can't balance when only one class is"
" present.")

# Store the size of X to check at sampling time if we have the
# same data
@@ -88,12 +83,16 @@ def fit(self, X, y):
np.unique(y).size, self.stats_c_)

# Check if the ratio provided at initialisation makes sense
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio < (self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]):
raise RuntimeError('The ratio requested at initialisation'
' should be greater than or equal to the'
' balancing ratio of the current data.')
' balancing ratio of the current data.'
' Got {} < {}.'.format(
self.ratio,
self.stats_c_[self.min_c_] /
self.stats_c_[self.maj_c_]))

return self

@@ -122,14 +121,14 @@ def sample(self, X, y):
X, y = check_X_y(X, y)

# Check that the data have been fitted
if not hasattr(self, 'stats_c_'):
raise RuntimeError('You need to fit the data, first!!!')
check_is_fitted(self, 'stats_c_')

# Check if the size of the data is identical to the one seen at fitting
if X.shape != self.X_shape_:
raise RuntimeError('The data that you attempt to resample do not'
' seem to be the one earlier fitted. Use the'
' fitted data.')
' fitted data. Shape of data is {}, got {}'
' instead.'.format(self.X_shape_, X.shape))

if hasattr(self, 'ratio'):
self._validate_ratio()
@@ -170,17 +169,23 @@ def _validate_ratio(self):
# The ratio correspond to the number of samples in the minority class
# over the number of samples in the majority class. Thus, the ratio
# cannot be greater than 1.0
if isinstance(self.ratio, float):
if isinstance(self.ratio, Real):
if self.ratio > 1:
raise ValueError('Ration cannot be greater than one.')
raise ValueError('Ratio cannot be greater than one.'
' Got {}.'.format(self.ratio))
elif self.ratio <= 0:
raise ValueError('Ratio cannot be negative.')
raise ValueError('Ratio cannot be negative.'
' Got {}.'.format(self.ratio))

elif isinstance(self.ratio, six.string_types):
if self.ratio != 'auto':
raise ValueError('Unknown string for the parameter ratio.')
raise ValueError("Unknown string for the parameter ratio."
" Got {} instead of 'auto'".format(
self.ratio))
else:
raise ValueError('Unknown parameter type for ratio.')
raise ValueError('Unknown parameter type for ratio.'
' Got {} instead of float or str'.format(
type(self.ratio)))

def _validate_size_ngh_deprecation(self):
"Private function to warn about the deprecation about size_ngh."
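The reworked checks in ``base.py`` above reduce to two rules for a numeric ``ratio`` (the requested number of minority samples over the number of majority samples): ``_validate_ratio`` requires it to lie in ``(0, 1]``, and ``fit`` refuses a ratio smaller than the balance already present in the data. A standalone sketch of that arithmetic, written against plain Python rather than any imblearn class:

```python
# Standalone sketch of the ratio rules enforced in the diff above.
from __future__ import division  # keep the divisions floating-point on Python 2

from collections import Counter

y = [0] * 80 + [1] * 20                    # 80 majority samples, 20 minority
counts = Counter(y)
min_c = min(counts, key=counts.get)        # minority class label
maj_c = max(counts, key=counts.get)        # majority class label
current_balance = counts[min_c] / counts[maj_c]   # 20 / 80 = 0.25

def validate_ratio(ratio, current_balance):
    """Mirror the numeric checks of SamplerMixin._validate_ratio and fit."""
    if ratio > 1:
        raise ValueError('Ratio cannot be greater than one. Got {}.'.format(ratio))
    if ratio <= 0:
        raise ValueError('Ratio cannot be negative. Got {}.'.format(ratio))
    if ratio < current_balance:
        raise RuntimeError('The ratio requested at initialisation should be '
                           'greater than or equal to the balancing ratio of '
                           'the current data. Got {} < {}.'.format(
                               ratio, current_balance))
    return ratio

validate_ratio(0.5, current_balance)       # fine: 0.25 <= 0.5 <= 1
# validate_ratio(0.1, current_balance)     # would raise: 0.1 < 0.25
# validate_ratio(1.5, current_balance)     # would raise: greater than one
```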
6 changes: 4 additions & 2 deletions imblearn/combine/smote_enn.py
@@ -202,7 +202,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
' Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
@@ -234,7 +235,8 @@ def _validate_estimator(self):
if isinstance(self.enn, EditedNearestNeighbours):
self.enn_ = self.enn
else:
raise ValueError('enn needs to be an EditedNearestNeighbours.')
raise ValueError('enn needs to be an EditedNearestNeighbours.'
' Got {} instead.'.format(type(self.enn)))
# Otherwise create a default EditedNearestNeighbours
else:
self.enn_ = EditedNearestNeighbours(random_state=self.random_state)
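Since ``_validate_estimator`` now reports the offending type in its error message, here is a short sketch of the two supported ways of configuring :class:`SMOTEENN` suggested by the diff above; the parameter values are illustrative, and the ``smote_tomek.py`` diff below follows the same pattern with a ``TomekLinks`` instance.

```python
# Hedged sketch: SMOTEENN either receives ready-made SMOTE / EditedNearestNeighbours
# objects or builds default ones, as in the _validate_estimator diff above.
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# Explicit sub-estimators: anything that is not a SMOTE (resp. an
# EditedNearestNeighbours) instance now fails with
# "... needs to be a SMOTE object. Got <type> instead."
smote_enn = SMOTEENN(smote=SMOTE(random_state=0),
                     enn=EditedNearestNeighbours())

# Defaults: leaving smote/enn unset lets SMOTEENN create them itself.
smote_enn_default = SMOTEENN(random_state=0)
```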
8 changes: 5 additions & 3 deletions imblearn/combine/smote_tomek.py
@@ -173,7 +173,8 @@ def _validate_estimator(self):
if isinstance(self.smote, SMOTE):
self.smote_ = self.smote
else:
raise ValueError('smote needs to be a SMOTE object.')
raise ValueError('smote needs to be a SMOTE object.'
' Got {} instead.'.format(type(self.smote)))
# Otherwise create a default SMOTE
else:
self.smote_ = SMOTE(
@@ -192,8 +193,9 @@ def _validate_estimator(self):
if isinstance(self.tomek, TomekLinks):
self.tomek_ = self.tomek
else:
raise ValueError('tomek needs to be a TomekLinks object.')
# Otherwise create a default EditedNearestNeighbours
raise ValueError('tomek needs to be a TomekLinks object.'
' Got {} instead.'.format(type(self.tomek)))
# Otherwise create a default TomekLinks
else:
self.tomek_ = TomekLinks(random_state=self.random_state)
