1 change: 1 addition & 0 deletions doc/api.rst
@@ -109,6 +109,7 @@ Prototype selection
:template: class.rst

ensemble.BalanceCascade
ensemble.BalancedBaggingClassifier
ensemble.EasyEnsemble


56 changes: 56 additions & 0 deletions doc/ensemble.rst
@@ -6,6 +6,11 @@ Ensemble of samplers

.. currentmodule:: imblearn.ensemble

.. _ensemble_samplers:

Samplers
--------

An imbalanced data set can be balanced by creating several balanced
subsets. The module :mod:`imblearn.ensemble` makes it possible to create such
sets.
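
As a quick sketch (assuming a toy imbalanced data set generated with
:func:`sklearn.datasets.make_classification`; the parameter values are
illustrative only), the :class:`EasyEnsemble` sampler draws several balanced,
randomly under-sampled subsets::

>>> from sklearn.datasets import make_classification
>>> from imblearn.ensemble import EasyEnsemble
>>> X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
...                            flip_y=0, random_state=0)
>>> ee = EasyEnsemble(n_subsets=3, random_state=0)
>>> X_res, y_res = ee.fit_sample(X, y)
>>> # 3 subsets, each keeping the 100 minority samples together with 100
>>> # randomly under-sampled majority samples (20 features by default)
>>> print(X_res.shape)
(3, 200, 20)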

@@ -54,3 +59,54 @@ parameter ``n_max_subset`` and an additional bootstrapping can be activated with
See
:ref:`sphx_glr_auto_examples_ensemble_plot_easy_ensemble.py` and
:ref:`sphx_glr_auto_examples_ensemble_plot_balance_cascade.py`.

.. _ensemble_meta_estimators:

Chaining ensemble of samplers and estimators
--------------------------------------------

In ensemble classifiers, bagging methods build several estimators on
different randomly selected subsets of data. In scikit-learn, this classifier
is named ``BaggingClassifier``. However, this classifier does not allow each
subset of data to be balanced. Therefore, when training on an imbalanced data
set, this classifier will favor the majority classes::

>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import confusion_matrix
>>> from sklearn.ensemble import BaggingClassifier
>>> from sklearn.tree import DecisionTreeClassifier
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
>>> bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
... random_state=0)
>>> bc.fit(X_train, y_train)  # doctest: +ELLIPSIS
BaggingClassifier(...)
>>> y_pred = bc.predict(X_test)
>>> confusion_matrix(y_test, y_pred)
array([[ 0, 0, 12],
[ 0, 0, 59],
[ 0, 0, 1179]])

:class:`BalancedBaggingClassifier` allows each subset of data to be resampled
before training each estimator of the ensemble. In short, it combines the
output of an :class:`EasyEnsemble` sampler with an ensemble of classifiers
(i.e. ``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier`
takes the same parameters as the scikit-learn ``BaggingClassifier``. In
addition, it accepts two more parameters, ``ratio`` and ``replacement``, as in
the :class:`EasyEnsemble` sampler::


>>> from imblearn.ensemble import BalancedBaggingClassifier
>>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
... ratio='auto',
... replacement=False,
... random_state=0)
>>> bbc.fit(X_train, y_train)  # doctest: +ELLIPSIS
BalancedBaggingClassifier(...)
>>> y_pred = bbc.predict(X_test)
>>> confusion_matrix(y_test, y_pred)
array([[ 12, 0, 0],
[ 0, 55, 4],
[ 68, 53, 1058]])
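
Since :class:`BalancedBaggingClassifier` accepts the usual
``BaggingClassifier`` parameters, the size of the ensemble can be tuned in the
same way; a minimal sketch, with parameter values chosen only for
illustration::

>>> bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
...                                 n_estimators=50,
...                                 ratio='auto',
...                                 replacement=True,
...                                 random_state=0)
>>> bbc.fit(X_train, y_train)  # doctest: +ELLIPSIS
BalancedBaggingClassifier(...)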

See
:ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -53,6 +53,10 @@ New features
Enhancement
~~~~~~~~~~~

- Add :class:`ensemble.BalancedBaggingClassifier`, a meta-estimator which
  chains an :class:`ensemble.EasyEnsemble` sampler with a classifier. By
  `Guillaume Lemaitre`_.

- All samplers accept sparse matrices, defaulting to the CSR format. By
  `Guillaume Lemaitre`_.

104 changes: 104 additions & 0 deletions examples/ensemble/plot_comparison_bagging_classifier.py
@@ -0,0 +1,104 @@
"""
=========================================================
Comparison of balanced and imbalanced bagging classifiers
=========================================================

This example shows the benefit of balancing the training set when using a
bagging classifier. ``BalancedBaggingClassifier`` chains a
``RandomUnderSampler`` and a given classifier while ``BaggingClassifier``
directly uses the imbalanced data.

Balancing the data set before training the classifier improves the
classification performance. In addition, it prevents the ensemble from
focusing on the majority class, which is a known drawback of decision tree
classifiers.

"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

from collections import Counter
import itertools

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

from imblearn.metrics import classification_report_imbalanced


def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')

print(cm)

plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')


iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 40, 2: 50},
random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=iris.target_names,
title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=iris.target_names,
title='Confusion matrix using BalancedBaggingClassifier')

plt.show()
4 changes: 3 additions & 1 deletion imblearn/ensemble/__init__.py
@@ -6,4 +6,6 @@
from .easy_ensemble import EasyEnsemble
from .balance_cascade import BalanceCascade

__all__ = ['EasyEnsemble', 'BalanceCascade']
from .classifier import BalancedBaggingClassifier

__all__ = ['EasyEnsemble', 'BalancedBaggingClassifier', 'BalanceCascade']
4 changes: 2 additions & 2 deletions imblearn/ensemble/balance_cascade.py
@@ -27,7 +27,7 @@ class BalanceCascade(BaseEnsembleSampler):
This method iteratively selects subsets and makes an ensemble of the
different sets. The selection is performed using a specific classifier.

Read more in the :ref:`User Guide <ensemble>`.
Read more in the :ref:`User Guide <ensemble_samplers>`.

Parameters
----------
@@ -99,7 +99,7 @@ class BalanceCascade(BaseEnsembleSampler):

See also
--------
EasyEnsemble
BalancedBaggingClassifier, EasyEnsemble

References
----------