Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions examples/under-sampling/plot_repeated_edited_nearest_neighbours.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
==================================
Repeated Edited nearest-neighbours
==================================

An illustration of the repeated edited nearest-neighbours method.

"""

print(__doc__)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Define some color for the plotting
almost_black = '#262626'
palette = sns.color_palette()

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from unbalanced_dataset.under_sampling import (
    EditedNearestNeighbours, RepeatedEditedNearestNeighbours)


def plot_classes(ax, X_2d, labels, title):
    """Scatter the two classes of a 2D-projected dataset on one axis.

    Parameters
    ----------
    ax : matplotlib axes
        Axes on which the points are drawn.

    X_2d : ndarray, shape (n_samples, 2)
        Samples projected in the 2D visualisation space.

    labels : ndarray, shape (n_samples, )
        Class label (0 or 1) of each row of `X_2d`.

    title : str
        Title given to the axes.

    """
    # Class #0 uses the first palette colour, class #1 the third one
    for class_id, colour in ((0, palette[0]), (1, palette[2])):
        mask = labels == class_id
        ax.scatter(X_2d[mask, 0], X_2d[mask, 1],
                   label="Class #{}".format(class_id), alpha=.5,
                   edgecolor=almost_black, facecolor=colour,
                   linewidth=0.15)
    ax.set_title(title)


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Three subplots, unpack the axes array immediately
f, (ax1, ax2, ax3) = plt.subplots(1, 3)

plot_classes(ax1, X_vis, y, 'Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_transform(X, y)
# Reuse the PCA fitted on the original data so all panels share one space
X_res_vis = pca.transform(X_resampled)

plot_classes(ax2, X_res_vis, y_resampled, 'Edited nearest neighbours')

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

plot_classes(ax3, X_res_vis, y_resampled,
             'Repeated Edited nearest neighbours')

plt.show()
2 changes: 2 additions & 0 deletions unbalanced_dataset/under_sampling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .one_sided_selection import OneSidedSelection
from .neighbourhood_cleaning_rule import NeighbourhoodCleaningRule
from .edited_nearest_neighbours import EditedNearestNeighbours
from .edited_nearest_neighbours import RepeatedEditedNearestNeighbours
from .instance_hardness_threshold import InstanceHardnessThreshold

__all__ = ['UnderSampler',
Expand All @@ -23,4 +24,5 @@
'OneSidedSelection',
'NeighbourhoodCleaningRule',
'EditedNearestNeighbours',
'RepeatedEditedNearestNeighbours',
'InstanceHardnessThreshold']
216 changes: 216 additions & 0 deletions unbalanced_dataset/under_sampling/edited_nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,219 @@ def transform(self, X, y):
return X_resampled, y_resampled, idx_under
else:
return X_resampled, y_resampled


class RepeatedEditedNearestNeighbours(UnderSampler):
"""Class to perform under-sampling based on the repeated edited nearest
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have a trailing space at the end of the line

neighbour method.

Parameters
----------
return_indices : bool, optional (default=False)
Either to return or not the indices which will be selected from
the majority class.

random_state : int or None, optional (default=None)
Seed for random number generation.

verbose : bool, optional (default=True)
Whether or not to print information about the processing.

size_ngh : int, optional (default=3)
Size of the neighbourhood to consider to compute the average
distance to the minority point samples.

kind_sel : str, optional (default='all')
Strategy to use in order to exclude samples.

- If 'all', all neighbours will have to agree with the samples of
interest to not be excluded.
- If 'mode', the majority vote of the neighbours will be used in
order to exclude a sample.

n_jobs : int, optional (default=-1)
The number of threads to open when it is possible.

Attributes
----------
ratio_ : str or float, optional (default='auto')
If 'auto', the ratio will be defined automatically to balance
the dataset. Otherwise, the ratio will correspond to the number
of samples in the minority class over the number of samples
in the majority class.

rs_ : int or None, optional (default=None)
Seed for random number generation.

min_c_ : str or int
The identifier of the minority class.

max_c_ : str or int
The identifier of the majority class.

stats_c_ : dict of str/int : int
A dictionary in which the number of occurrences of each class is
reported.

max_iter : int, optional (default=100)
Maximum number of iterations of the edited nearest neighbours
algorithm for a single run.

Notes
-----
The method is based on [1]_.

This class supports multi-class.

References
----------
.. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some non-ASCII characters apparently.

.. [1] I. Tomek, "An Experiment with the Edited Nearest-Neighbor Rule," IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, June 1976. 
Rule," IEEE Trans. Systems, Man, and Cybernetics, vol. 6, no. 6,
pp. 448-452, June 1976.

"""

def __init__(self, return_indices=False, random_state=None, verbose=True,
             size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
    """Initialisation of RENN object.

    Parameters
    ----------
    return_indices : bool, optional (default=False)
        Whether or not to return the indices of the samples which are
        selected. Forwarded to the parent class and to the internal
        edited nearest neighbours object.

    random_state : int or None, optional (default=None)
        Seed for random number generation. Forwarded to the parent class
        and to the internal edited nearest neighbours object.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    size_ngh : int, optional (default=3)
        Size of the neighbourhood to consider; forwarded to the internal
        edited nearest neighbours object.

    max_iter : int, optional (default=100)
        Maximum number of iterations of the edited nearest neighbours
        algorithm for a single run. Must be at least 2.

    kind_sel : str, optional (default='all')
        Strategy to use in order to exclude samples.

        - If 'all', all neighbours will have to agree with the samples of
          interest to not be excluded.
        - If 'mode', the majority vote of the neighbours will be used in
          order to exclude a sample.

    n_jobs : int, optional (default=-1)
        The number of threads to open when it is possible.

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        If `kind_sel` is neither 'all' nor 'mode'.

    ValueError
        If `max_iter` is lower than 2.

    """
    super(RepeatedEditedNearestNeighbours, self).__init__(
        return_indices=return_indices,
        random_state=random_state,
        verbose=verbose)

    self.size_ngh = size_ngh

    possible_kind_sel = ('all', 'mode')
    if kind_sel not in possible_kind_sel:
        # Same exception type as before, now with an actionable message
        raise NotImplementedError(
            'Unknown kind_sel {!r}; supported values are {}.'.format(
                kind_sel, possible_kind_sel))
    self.kind_sel = kind_sel
    self.n_jobs = n_jobs

    if max_iter < 2:
        raise ValueError('max_iter must be greater than 1.')
    self.max_iter = max_iter

    # The wrapped ENN is silenced (verbose=False); this object reports the
    # final result itself at the end of `transform`.
    self.enn_ = EditedNearestNeighbours(
        return_indices=return_indices,
        random_state=random_state, verbose=False,
        size_ngh=size_ngh, kind_sel=kind_sel,
        n_jobs=n_jobs)

def fit(self, X, y):
    """Fit this sampler and its internal edited nearest neighbours object.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.

    """
    # Validate X and y once; both fits below receive the checked arrays
    X, y = check_X_y(X, y)

    # Fit the parent class first (presumably where the class statistics
    # are gathered -- its code is not shown here), then the wrapped ENN.
    parent = super(RepeatedEditedNearestNeighbours, self)
    parent.fit(X, y)
    self.enn_.fit(X, y)

    return self

def transform(self, X, y):
    """Resample the dataset by repeatedly applying the internal ENN.

    The internal edited nearest neighbours object is applied to its own
    output until an iteration removes no sample or `max_iter` iterations
    have been run.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    idx_under : ndarray, shape (n_samples_new, )
        Only returned if `return_indices` is `True`. Integer array,
        started as ``np.arange(n_samples)`` and narrowed at each
        iteration with the selection returned by the internal ENN, so it
        is expected to hold the positions in the input `X` of the samples
        which have been kept (assuming the ENN selection is relative to
        the array it was given).

    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)
    # Work on copies so the caller's arrays are never modified
    X_, y_ = X.copy(), y.copy()

    if self.return_indices:
        idx_under = np.arange(X.shape[0], dtype=int)

    for _ in range(self.max_iter):
        prev_len = y_.shape[0]
        if self.return_indices:
            X_, y_, idx_ = self.enn_.transform(X_, y_)
            # Compose with the previous selection so that the indices
            # keep referring to the original input.
            idx_under = idx_under[idx_]
        else:
            X_, y_ = self.enn_.transform(X_, y_)

        # Converged: the last pass did not remove any sample
        if prev_len == y_.shape[0]:
            break

    if self.verbose:
        print("Under-sampling performed: {}".format(Counter(y_)))

    X_resampled, y_resampled = X_, y_

    # Check if the indices of the samples selected should be returned too
    if self.return_indices:
        return X_resampled, y_resampled, idx_under
    return X_resampled, y_resampled