scikit-learn-contrib · glemaitre · Jun 24, 2016 · Jun 24, 2016 · Jun 24, 2016 · Jun 24, 2016
diff --git a/examples/under-sampling/plot_repeated_edited_nearest_neighbours.py b/examples/under-sampling/plot_repeated_edited_nearest_neighbours.py
@@ -0,0 +1,74 @@
+"""
+==================================
+Repeated Edited nearest-neighbours
+==================================
+
+An illustration of the repeated edited nearest-neighbours method.
+
+"""
+
+print(__doc__)
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set()
+
+# Define some colours for the plotting
+almost_black = '#262626'
+palette = sns.color_palette()
+
+from sklearn.datasets import make_classification
+from sklearn.decomposition import PCA
+
+from unbalanced_dataset.under_sampling import (
+    EditedNearestNeighbours, RepeatedEditedNearestNeighbours)
+
+# Generate the dataset
+X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
+                           n_informative=3, n_redundant=1, flip_y=0,
+                           n_features=5, n_clusters_per_class=1,
+                           n_samples=5000, random_state=10)
+
+# Instantiate a PCA object for the sake of easy visualisation
+pca = PCA(n_components=2)
+# Fit on the original X only; resampled sets reuse this same projection
+X_vis = pca.fit_transform(X)
+
+# Three subplots, unpack the axes array immediately
+f, (ax1, ax2, ax3) = plt.subplots(1, 3)
+
+ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
+            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
+ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
+            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
+ax1.set_title('Original set')
+
+# Apply the ENN
+print('ENN')
+enn = EditedNearestNeighbours()
+X_resampled, y_resampled = enn.fit_transform(X, y)
+X_res_vis = pca.transform(X_resampled)
+
+ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
+            label="Class #0", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[0], linewidth=0.15)
+ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
+            label="Class #1", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[2], linewidth=0.15)
+ax2.set_title('Edited nearest neighbours')
+
+# Apply the RENN
+print('RENN')
+renn = RepeatedEditedNearestNeighbours()
+X_resampled, y_resampled = renn.fit_transform(X, y)
+X_res_vis = pca.transform(X_resampled)
+
+ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
+            label="Class #0", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[0], linewidth=0.15)
+ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
+            label="Class #1", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[2], linewidth=0.15)
+ax3.set_title('Repeated Edited nearest neighbours')
+
+plt.show()
diff --git a/unbalanced_dataset/under_sampling/__init__.py b/unbalanced_dataset/under_sampling/__init__.py
@@ -12,6 +12,7 @@
 from .one_sided_selection import OneSidedSelection
 from .neighbourhood_cleaning_rule import NeighbourhoodCleaningRule
 from .edited_nearest_neighbours import EditedNearestNeighbours
+from .edited_nearest_neighbours import RepeatedEditedNearestNeighbours
 from .instance_hardness_threshold import InstanceHardnessThreshold
 
 __all__ = ['UnderSampler',
@@ -23,4 +24,5 @@
  'OneSidedSelection',
  'NeighbourhoodCleaningRule',
  'EditedNearestNeighbours',
+ 'RepeatedEditedNearestNeighbours',
  'InstanceHardnessThreshold']
diff --git a/unbalanced_dataset/under_sampling/edited_nearest_neighbours.py b/unbalanced_dataset/under_sampling/edited_nearest_neighbours.py
@@ -251,3 +251,219 @@ def transform(self, X, y):
  return X_resampled, y_resampled, idx_under
  else:
  return X_resampled, y_resampled
+
+
+class RepeatedEditedNearestNeighbours(UnderSampler):
+    """Class to perform under-sampling based on the repeated edited nearest
+    neighbour method.
+
+    Parameters
+    ----------
+    return_indices : bool, optional (default=False)
+        Whether or not to return the indices of the samples which are
+        selected from the original set.
+
+    random_state : int or None, optional (default=None)
+        Seed for random number generation.
+
+    verbose : bool, optional (default=True)
+        Whether or not to print information about the processing.
+
+    size_ngh : int, optional (default=3)
+        Size of the neighbourhood used by the underlying edited nearest
+        neighbours rule.
+
+    max_iter : int, optional (default=100)
+        Maximum number of iterations of the edited nearest neighbours
+        algorithm for a single run. Must be greater than 1.
+
+    kind_sel : str, optional (default='all')
+        Strategy to use in order to exclude samples.
+
+        - If 'all', all neighbours will have to agree with the samples of
+          interest to not be excluded.
+        - If 'mode', the majority vote of the neighbours will be used in
+          order to exclude a sample.
+
+    n_jobs : int, optional (default=-1)
+        The number of threads to open when it is possible.
+
+    Attributes
+    ----------
+    ratio_ : str or float, optional (default='auto')
+        If 'auto', the ratio will be defined automatically to balance
+        the dataset. Otherwise, the ratio will correspond to the number
+        of samples in the minority class over the number of samples
+        in the majority class.
+
+    rs_ : int or None, optional (default=None)
+        Seed for random number generation.
+
+    min_c_ : str or int
+        The identifier of the minority class.
+
+    max_c_ : str or int
+        The identifier of the majority class.
+
+    stats_c_ : dict of str/int : int
+        A dictionary in which the number of occurrences of each class is
+        reported.
+
+    enn_ : EditedNearestNeighbours
+        The edited nearest neighbours sampler applied at each iteration.
+
+    Notes
+    -----
+    The method is based on [1]_.
+
+    This class supports multi-class.
+
+    References
+    ----------
+    .. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
+       Rule," IEEE Trans. Systems, Man, and Cybernetics, vol. 6, no. 6,
+       pp. 448-452, June 1976.
+
+    """
+
+    def __init__(self, return_indices=False, random_state=None, verbose=True,
+                 size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
+        """Initialisation of RENN object.
+
+        Parameters
+        ----------
+        return_indices : bool, optional (default=False)
+            Whether or not to return the indices of the samples which are
+            selected from the original set.
+
+        random_state : int or None, optional (default=None)
+            Seed for random number generation.
+
+        verbose : bool, optional (default=True)
+            Whether or not to print information about the processing.
+
+        size_ngh : int, optional (default=3)
+            Size of the neighbourhood used by the underlying edited nearest
+            neighbours rule.
+
+        max_iter : int, optional (default=100)
+            Maximum number of iterations of the edited nearest neighbours
+            algorithm for a single run. Must be greater than 1.
+
+        kind_sel : str, optional (default='all')
+            Strategy to use in order to exclude samples.
+
+            - If 'all', all neighbours will have to agree with the samples of
+              interest to not be excluded.
+            - If 'mode', the majority vote of the neighbours will be used in
+              order to exclude a sample.
+
+        n_jobs : int, optional (default=-1)
+            The number of threads to open when it is possible.
+
+        Raises
+        ------
+        NotImplementedError
+            If `kind_sel` is neither 'all' nor 'mode'.
+        ValueError
+            If `max_iter` is lower than 2.
+
+        """
+        super(RepeatedEditedNearestNeighbours, self).__init__(
+            return_indices=return_indices, random_state=random_state,
+            verbose=verbose)
+
+        self.size_ngh = size_ngh
+        possible_kind_sel = ('all', 'mode')
+        if kind_sel not in possible_kind_sel:
+            raise NotImplementedError(
+                'kind_sel must be one of {}.'.format(possible_kind_sel))
+        self.kind_sel = kind_sel
+        self.n_jobs = n_jobs
+
+        if max_iter < 2:
+            raise ValueError('max_iter must be greater than 1.')
+        self.max_iter = max_iter
+
+        # Single ENN pass re-applied by transform(); kept silent on purpose
+        self.enn_ = EditedNearestNeighbours(
+            return_indices=return_indices, random_state=random_state,
+            verbose=False, size_ngh=size_ngh, kind_sel=kind_sel,
+            n_jobs=n_jobs)
+
+    def fit(self, X, y):
+        """Find the classes statistics before performing sampling.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        self : object
+            Return self.
+
+        """
+        # Check the consistency of X and y
+        X, y = check_X_y(X, y)
+
+        super(RepeatedEditedNearestNeighbours, self).fit(X, y)
+        self.enn_.fit(X, y)
+
+        return self
+
+    def transform(self, X, y):
+        """Resample the dataset.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        X_resampled : ndarray, shape (n_samples_new, n_features)
+            The array containing the resampled data.
+
+        y_resampled : ndarray, shape (n_samples_new)
+            The corresponding label of `X_resampled`
+
+        idx_under : ndarray, shape (n_samples_new, )
+            If `return_indices` is `True`, an integer array will be returned
+            containing the indices, in `X`, of the selected samples.
+
+        """
+        # Check the consistency of X and y
+        X, y = check_X_y(X, y)
+        X_, y_ = X.copy(), y.copy()
+
+        if self.return_indices:
+            idx_under = np.arange(X.shape[0], dtype=int)
+
+        for _ in range(self.max_iter):
+            prev_len = y_.shape[0]
+            if self.return_indices:
+                X_, y_, idx_ = self.enn_.transform(X_, y_)
+                # Map this pass's selection back onto the indexing of X
+                idx_under = idx_under[idx_]
+            else:
+                X_, y_ = self.enn_.transform(X_, y_)
+
+            # Stop as soon as a pass no longer removes any sample
+            if prev_len == y_.shape[0]:
+                break
+
+        if self.verbose:
+            print("Under-sampling performed: {}".format(Counter(y_)))
+
+        # Check if the indices of the samples selected should be returned too
+        if self.return_indices:
+            return X_, y_, idx_under
+        return X_, y_