20 changes: 15 additions & 5 deletions README.md
@@ -1,10 +1,20 @@
UnbalancedDataset is a Python package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance.

[![Code Health](https://landscape.io/github/fmfn/UnbalancedDataset/master/landscape.svg?style=flat)](https://landscape.io/github/fmfn/UnbalancedDataset/master)

Installation
============

UnbalancedDataset is not currently available on PyPI. To install the package, you will need to clone the repository and run the
setup.py file. Use the following commands to get a copy from GitHub and install all dependencies:

git clone https://github.com/fmfn/UnbalancedDataset.git
cd UnbalancedDataset
python setup.py install
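
To check that the install worked, the package should be importable under the unbalanced_dataset name used by the test suite later in this diff; a minimal check:

    # Minimal import check; assumes the installed package keeps the
    # unbalanced_dataset directory name shown elsewhere in this diff.
    import unbalanced_dataset
    print(unbalanced_dataset.__file__)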

UnbalancedDataset
=================

UnbalancedDataset is a Python module offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance.

Most classification algorithms will only perform optimally when the number of samples in each class is roughly the same. Highly skewed datasets, where the minority class is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common.

One way of addressing this issue is by re-sampling the dataset so as to offset this imbalance, with the hope of arriving at a more robust and fair decision boundary than you would otherwise.
@@ -30,7 +40,7 @@ Below is a list of the methods currently implemented in this module.
3. bSMOTE(1&2) - Borderline SMOTE of types 1 and 2
4. SVM_SMOTE - Support Vectors SMOTE

* Over-sampling follow by under-sampling
* Over-sampling followed by under-sampling
1. SMOTE + Tomek links
2. SMOTE + ENN

@@ -44,8 +54,8 @@ Example:
This is a work in progress. Any comments, suggestions or corrections are welcome.

Dependencies:
* Numpy
* Scikit-Learn
* numpy
* scikit-learn

References:

@@ -15,11 +15,23 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"ename": "ValueError",
"evalue": "Attempted relative import in non-package",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-fb71cb5ef470>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecomposition\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mPCA\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mUnbalancedDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mUnderSampler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNearMiss\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mCondensedNearestNeighbour\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOneSidedSelection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTomekLinks\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mClusterCentroids\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOverSampler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSMOTE\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSMOTETomek\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Attempted relative import in non-package"
]
}
],
"source": [
"%matplotlib inline\n",
"\n",
@@ -34,9 +46,10 @@
"from sklearn.datasets import make_classification\n",
"from sklearn.decomposition import PCA\n",
"\n",
"from UnbalancedDataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection,\\\n",
"NeighboorhoodCleaningRule, TomekLinks, ClusterCentroids, OverSampler, SMOTE, bSMOTE1, bSMOTE2, SVM_SMOTE,\\\n",
"SMOTETomek, SMOTEENN, EasyEnsemble, BalanceCascade\n",
"from ..unbalanced_dataset import UnbalancedDataset\n",
"from ..unbalanced_dataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection\n",
"from ..unbalanced_dataset import TomekLinks, ClusterCentroids, OverSampler, SMOTE, SMOTETomek\n",
"from ..unbalanced_dataset import EasyEnsemble, BalanceCascade, NeighbourhoodCleaningRule, SMOTEENN\n",
"\n",
"# Save a nice dark grey as a variable\n",
"almost_black = '#262626'"
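The error output captured in the cell above ("Attempted relative import in non-package") is triggered by the new "from ..unbalanced_dataset import ..." lines: a notebook is not executed as part of a package, so relative imports cannot be resolved there. Below is a minimal sketch of the absolute imports that should work once the package is installed; the module paths are assumed from the layout used in test/test.py later in this diff:

    # Absolute imports usable from a notebook after installing the package.
    # Module paths are assumed from the layout shown in test/test.py in this diff.
    from unbalanced_dataset.over_sampling import OverSampler, SMOTE
    from unbalanced_dataset.under_sampling import UnderSampler, NearMiss, TomekLinks
    from unbalanced_dataset.ensemble_sampling import EasyEnsemble, BalanceCascade
    from unbalanced_dataset.pipeline import SMOTEENN, SMOTETomek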
25 changes: 25 additions & 0 deletions setup.py
@@ -0,0 +1,25 @@
from setuptools import setup, find_packages

install_requires = [
'numpy',
'scipy',
'scikit-learn',
]

setup(name='UnbalancedDataset',
version='0.1',
description='Python module with numerous re-sampling strategies to deal '
'with classification of data-sets with strong between class '
'imbalance.',
classifiers=[
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.4",
],
author='Fernando Nogueira, Guillaume Lemaitre',
author_email='fmfnogueira@gmail.com, guillaume.lemaitre@udg.edu',
url='https://github.com/fmfn/UnbalancedDataset',
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=install_requires,
)
141 changes: 0 additions & 141 deletions test/Visualization.py

This file was deleted.

21 changes: 18 additions & 3 deletions test/test.py
@@ -3,9 +3,24 @@

from sklearn.datasets import make_classification

from unbalanced_dataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection
from unbalanced_dataset import TomekLinks, ClusterCentroids, OverSampler, SMOTE, SMOTETomek
from unbalanced_dataset import EasyEnsemble, BalanceCascade, NeighbourhoodCleaningRule, SMOTEENN
from unbalanced_dataset.unbalanced_dataset import UnbalancedDataset

from unbalanced_dataset.over_sampling import OverSampler
from unbalanced_dataset.over_sampling import SMOTE

from unbalanced_dataset.under_sampling import UnderSampler
from unbalanced_dataset.under_sampling import TomekLinks
from unbalanced_dataset.under_sampling import ClusterCentroids
from unbalanced_dataset.under_sampling import NearMiss
from unbalanced_dataset.under_sampling import CondensedNearestNeighbour
from unbalanced_dataset.under_sampling import OneSidedSelection
from unbalanced_dataset.under_sampling import NeighbourhoodCleaningRule

from unbalanced_dataset.ensemble_sampling import EasyEnsemble
from unbalanced_dataset.ensemble_sampling import BalanceCascade

from unbalanced_dataset.pipeline import SMOTEENN
from unbalanced_dataset.pipeline import SMOTETomek

# Generate some data
print('Generate samples using scikit-learn')
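For reference, a minimal sketch of how these imports are typically exercised; the default constructor and the fit_transform(x, y) call are assumptions based on the package's usual usage pattern, not something shown in this hunk:

    # Sketch only: assumes SMOTE exposes fit_transform(x, y) returning the
    # re-sampled arrays; constructor left at defaults to avoid guessing keywords.
    from sklearn.datasets import make_classification
    from unbalanced_dataset.over_sampling import SMOTE

    x, y = make_classification(n_samples=5000, n_features=20, n_informative=10,
                               weights=[0.9, 0.1], random_state=0)
    sm = SMOTE()
    sx, sy = sm.fit_transform(x, y)
    print(sx.shape, sy.shape)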
20 changes: 19 additions & 1 deletion unbalanced_dataset/ensemble_sampling.py
@@ -207,6 +207,8 @@ def resample(self):

# Find the misclassified index to keep them for the next round
idx_mis_class = idx_sel_from_maj[np.nonzero(pred_label != N_y[idx_sel_from_maj])]
if self.verbose:
print("Elements misclassified: ", idx_mis_class)
# Count how many random element will be selected
n_elt_maj = self.ucd[self.minc] - idx_mis_class.size

@@ -218,8 +220,24 @@

# Check if we have to make an early stopping
if self.n_max_subset is not None:
if self.n_max_subset >= n_subsets:
if n_subsets == (self.n_max_subset - 1):
b_subset_search = False
# Select the remaining data
idx_sel_from_maj = np.nonzero(b_sel_N)[0]
idx_sel_from_maj = np.concatenate((idx_mis_class,
idx_sel_from_maj),
axis=0).astype(int)
# Select the final batch
x_data = np.concatenate((min_x, N_x[idx_sel_from_maj, :]), axis=0)
y_data = np.concatenate((min_y, N_y[idx_sel_from_maj]), axis=0)
# Push these data into a new subset
subsets_x.append(x_data)
subsets_y.append(y_data)
if self.verbose:
print("Creation of the subset #" + str(n_subsets))

# We found a new subset, increase the counter
n_subsets += 1
if self.verbose:
print('The maximum number of subsets has been reached')

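The block added above makes BalanceCascade stop once n_max_subset subsets have been built, pushing the remaining majority samples into one final subset instead of looping further. A minimal usage sketch, with keyword names inferred from the attributes referenced in this hunk (n_max_subset, verbose) and fit_transform(x, y) assumed as the entry point:

    # Sketch only: keyword names inferred from self.n_max_subset / self.verbose above;
    # the return value is assumed to be the lists of subsets built in resample().
    from sklearn.datasets import make_classification
    from unbalanced_dataset.ensemble_sampling import BalanceCascade

    x, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
    bc = BalanceCascade(n_max_subset=3, verbose=True)
    subsets_x, subsets_y = bc.fit_transform(x, y)
    print(len(subsets_x))  # no more than n_max_subset subsets, per the early stop above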
54 changes: 33 additions & 21 deletions unbalanced_dataset/over_sampling.py
@@ -419,34 +419,46 @@ def resample(self):
fractions = betavariate(alpha=10, beta=10)

# Interpolate samples in danger
nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
return_distance=False)[:, 1:]
if (np.count_nonzero(danger_bool) > 0):
nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
return_distance=False)[:, 1:]

sx1, sy1 = self.make_samples(support_vector[danger_bool],
minx,
self.minc, nns,
fractions * (int(self.ratio * len(minx)) + 1),
step_size=1,
random_state=self.rs,
verbose=self.verbose)
sx1, sy1 = self.make_samples(support_vector[danger_bool],
minx,
self.minc, nns,
fractions * (int(self.ratio * len(minx)) + 1),
step_size=1,
random_state=self.rs,
verbose=self.verbose)

# Extrapolate safe samples
nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
return_distance=False)[:, 1:]

sx2, sy2 = self.make_samples(support_vector[safety_bool],
minx,
self.minc, nns,
(1 - fractions) * int(self.ratio * len(minx)),
step_size=-self.out_step,
random_state=self.rs,
verbose=self.verbose)
if (np.count_nonzero(safety_bool) > 0):
nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
return_distance=False)[:, 1:]

sx2, sy2 = self.make_samples(support_vector[safety_bool],
minx,
self.minc, nns,
(1 - fractions) * int(self.ratio * len(minx)),
step_size=-self.out_step,
random_state=self.rs,
verbose=self.verbose)

if self.verbose:
print("done!")

# Concatenate the newly generated samples to the original data set
ret_x = concatenate((self.x, sx1, sx2), axis=0)
ret_y = concatenate((self.y, sy1, sy2), axis=0)
if ( (np.count_nonzero(danger_bool) > 0) and
(np.count_nonzero(safety_bool) > 0) ):
ret_x = concatenate((self.x, sx1, sx2), axis=0)
ret_y = concatenate((self.y, sy1, sy2), axis=0)
# No support vectors in danger
elif np.count_nonzero(danger_bool) == 0:
ret_x = concatenate((self.x, sx2), axis=0)
ret_y = concatenate((self.y, sy2), axis=0)
# All the support vectors are in danger
elif np.count_nonzero(safety_bool) == 0:
ret_x = concatenate((self.x, sx1), axis=0)
ret_y = concatenate((self.y, sy1), axis=0)

return ret_x, ret_y
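
With the guards above, SVM_SMOTE no longer assumes that both the "danger" and "safe" groups of support vectors are non-empty: only the sample arrays that were actually generated are concatenated back onto the original data. A minimal usage sketch; the default constructor, the module path and the fit_transform(x, y) entry point are assumptions rather than something shown in this hunk:

    # Sketch only: constructor left at defaults; fit_transform(x, y) is assumed to
    # run the resampling shown above and return the augmented arrays.
    from sklearn.datasets import make_classification
    from unbalanced_dataset.over_sampling import SVM_SMOTE

    x, y = make_classification(n_samples=500, weights=[0.95, 0.05], random_state=1)
    svm_sm = SVM_SMOTE()
    sx, sy = svm_sm.fit_transform(x, y)
    print(sx.shape, sy.shape)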