3 changes: 2 additions & 1 deletion doc/api.rst
@@ -133,7 +133,8 @@ Metrics
Functions
---------
.. autosummary::
   :toctree: generated/

   metrics.sensitivity_specificity_support
   metrics.sensitivity_score
   metrics.specificity_score
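The three functions added to the API reference above can also be called directly. A minimal sketch of `sensitivity_specificity_support` on toy labels (the labels are invented for illustration; with the default `average=None`, per-class arrays are returned):

from imblearn.metrics import sensitivity_specificity_support

# Toy labels, for illustration only
y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 1, 1]

# Per-class sensitivity, specificity and support
sen, spe, sup = sensitivity_specificity_support(y_true, y_pred, average=None)
print(sen, spe, sup)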
40 changes: 40 additions & 0 deletions examples/applications/plot_multi_class_under_sampling.py
@@ -0,0 +1,40 @@
"""
=============================================
Multiclass classification with under-sampling
=============================================

Some balancing methods can handle datasets with multiple classes.
This example illustrates their use, which does not differ from the
binary case.

"""

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset
iris = load_iris()
# Make the dataset imbalanced
# Drop half of the samples of the first class
iris.data = iris.data[25:, :]
iris.target = iris.target[25:]

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    random_state=RANDOM_STATE)

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
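To see what NearMiss does to the class distribution outside of the pipeline, one can compare the class counts before and after resampling. A short sketch, assuming the `fit_sample` API of the imblearn version targeted by this PR:

from collections import Counter

nm = NearMiss(version=2, random_state=RANDOM_STATE)
X_res, y_res = nm.fit_sample(X_train, y_train)
# Class counts before and after under-sampling
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_res).items()))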
6 changes: 6 additions & 0 deletions examples/datasets/README.txt
@@ -0,0 +1,6 @@
.. _dataset_examples:

Dataset examples
-----------------------

Examples concerning the :mod:`imblearn.datasets` module.
45 changes: 45 additions & 0 deletions examples/evaluation/plot_classification_report.py
@@ -0,0 +1,45 @@
"""
=============================================
Evaluate classification by compiling a report
=============================================

Specific metrics have been developed to evaluate classifiers trained
on imbalanced data. `imblearn` provides a classification report
similar to the one in `sklearn`, with additional metrics specific to
imbalanced learning problems.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))
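When single numbers are preferred over the full report, the individual metrics listed in doc/api.rst can be computed directly. A brief sketch; the choice of average='weighted' is illustrative:

from imblearn.metrics import sensitivity_score, specificity_score

# 'weighted' averages the per-class scores using the class support
print(sensitivity_score(y_test, y_pred_bal, average='weighted'))
print(specificity_score(y_test, y_pred_bal, average='weighted'))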
74 changes: 74 additions & 0 deletions examples/evaluation/plot_metrics.py
@@ -0,0 +1,74 @@
"""
=======================================
Metrics specific to imbalanced learning
=======================================

Specific metrics have been developed to evaluate classifiers trained
on imbalanced data. `imblearn` mainly provides two additional metrics
that are not implemented in `sklearn`: (i) the geometric mean and
(ii) the index balanced accuracy.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import (geometric_mean_score,
                              make_index_balanced_accuracy)

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=3, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining these two metrics accounts for
# both classes, so the score is not dominated by the majority class.

print('The geometric mean is {}'.format(geometric_mean_score(
    y_test,
    y_pred_bal)))

###############################################################################
# The index balanced accuracy can transform any metric into one suited
# for imbalanced learning problems by weighting it with the dominance
# (sensitivity - specificity).

alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))

alpha = 0.5
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))
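The IBA computed above follows the formula implemented in imblearn/metrics/classification.py further down in this diff: the (optionally squared) score is weighted by (1 + alpha * dominance), with dominance = sensitivity - specificity. A hand-computed sketch with invented numbers makes the arithmetic concrete:

import numpy as np

# Illustrative values only
sen, spe, alpha = 0.8, 0.6, 0.1
gmean = np.sqrt(sen * spe)               # geometric mean of sen and spe
dom = sen - spe                          # dominance
iba = (1. + alpha * dom) * gmean ** 2    # squared=True, as in the example
print(iba)                               # 0.4896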
2 changes: 1 addition & 1 deletion examples/model_selection/plot_validation_curve.py
@@ -29,7 +29,7 @@
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=10)
                                    random_state=RANDOM_STATE)
smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)
45 changes: 24 additions & 21 deletions imblearn/metrics/classification.py
@@ -14,6 +14,8 @@
import logging
import functools

from inspect import getcallargs

import numpy as np

from sklearn.metrics.classification import (_check_targets, _prf_divide,
@@ -22,6 +24,12 @@
from sklearn.utils.fixes import bincount
from sklearn.utils.multiclass import unique_labels

try:
    from inspect import signature
except ImportError:
    from sklearn.externals.funcsigs import signature


LOGGER = logging.getLogger(__name__)


@@ -563,10 +571,10 @@ def geometric_mean_score(y_true,


def make_index_balanced_accuracy(alpha=0.1, squared=True):
    """Balance any scoring function using the indexed balanced accuracy
    """Balance any scoring function using the index balanced accuracy

    This factory function wraps scoring function to express it as the
    indexed balanced accuracy (IBA). You need to use this function to
    index balanced accuracy (IBA). You need to use this function to
    decorate any scoring function.

Parameters
@@ -582,7 +590,7 @@ def make_index_balanced_accuracy(alpha=0.1, squared=True):
    -------
    iba_scoring_func : callable,
        Returns the scoring metric decorated which will automatically compute
        the indexed balanced accuracy.
        the index balanced accuracy.

Examples
--------
@@ -603,21 +611,16 @@ def compute_score(*args, **kwargs):
            # Square if desired
            if squared:
                _score = np.power(_score, 2)
            # args will contain the y_pred and y_true
            # kwargs will contain the other parameters
            labels = kwargs.get('labels', None)
            pos_label = kwargs.get('pos_label', 1)
            average = kwargs.get('average', 'binary')
            sample_weight = kwargs.get('sample_weight', None)
            # Compute the sensitivity and specificity
            dict_sen_spe = {
                'labels': labels,
                'pos_label': pos_label,
                'average': average,
                'sample_weight': sample_weight
            }
            sen, spe, _ = sensitivity_specificity_support(*args,
                                                          **dict_sen_spe)
            # Create the list of tags
            tags_scoring_func = getcallargs(scoring_func, *args, **kwargs)
            # Get the signature of the sens/spec function
            sens_spec_sig = signature(sensitivity_specificity_support)
            # Filter the inputs required by the sens/spec function
            tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func)
            # Call the sens/spec function
            sen, spe, _ = sensitivity_specificity_support(
                *tags_sens_spec.args,
                **tags_sens_spec.kwargs)
            # Compute the dominance
            dom = sen - spe
            return (1. + alpha * dom) * _score
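The new code above replaces the hard-coded kwargs filtering with introspection: getcallargs reconstructs the scoring function's call as a dict of named arguments, and signature(...).bind() maps that dict onto the signature of sensitivity_specificity_support. A self-contained sketch of the pattern; both functions here are hypothetical stand-ins, not imblearn code:

from inspect import getcallargs, signature

def scoring_func(y_true, y_pred, average='binary'):
    return 0.0

def target_func(y_true, y_pred, average='binary', sample_weight=None):
    return (y_true, y_pred, average)

# Reconstruct the call to scoring_func as {name: value}
call_args = getcallargs(scoring_func, [0, 1], [1, 1], average='macro')
# Re-bind those arguments onto target_func's signature
bound = signature(target_func).bind(**call_args)
print(target_func(*bound.args, **bound.kwargs))

Note that bind() raises a TypeError when the dict contains a name absent from the target signature, so the pattern only works when the scoring function's parameters are a subset of the target's.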
@@ -640,7 +643,7 @@ def classification_report_imbalanced(y_true,
    Specific metrics have been proposed to evaluate the classification
    performed on imbalanced dataset. This report compiles the
    state-of-the-art metrics: precision/recall/specificity, geometric
    mean, and indexed balanced accuracy of the
    mean, and index balanced accuracy of the
    geometric mean.

Parameters
@@ -674,7 +677,7 @@
    -------
    report : string
        Text summary of the precision, recall, specificity, geometric mean,
        and indexed balanced accuracy.
        and index balanced accuracy.

Examples
--------
@@ -746,7 +749,7 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.69\
        labels=labels,
        average=None,
        sample_weight=sample_weight)
    # Indexed balanced accuracy
    # Index balanced accuracy
    iba_gmean = make_index_balanced_accuracy(
        alpha=alpha, squared=True)(geometric_mean_score)
    iba = iba_gmean(