20 changes: 15 additions & 5 deletions README.md
@@ -1,10 +1,20 @@
UnbalancedDataset is a Python package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance.

[![Code Health](https://landscape.io/github/fmfn/UnbalancedDataset/master/landscape.svg?style=flat)](https://landscape.io/github/fmfn/UnbalancedDataset/master)

Installation
============

UnbalancedDataset is not currently available on PyPI. To install the package, you will need to clone the repository and run the
setup.py file. Use the following commands to get a copy from GitHub and install all dependencies:

git clone https://github.com/fmfn/UnbalancedDataset.git
cd UnbalancedDataset
python setup.py install
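
To check that the install worked, the package should be importable under the unbalanced_dataset name used by the test suite later in this diff; a minimal check:

    # Minimal import check; assumes the installed package keeps the
    # unbalanced_dataset directory name shown elsewhere in this diff.
    import unbalanced_dataset
    print(unbalanced_dataset.__file__)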

UnbalancedDataset
=================

UnbalancedDataset is a Python module offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance.

Most classification algorithms will only perform optimally when the number of samples in each class is roughly the same. Highly skewed datasets, where the minority class is heavily outnumbered by one or more classes, have proven to be a challenge while at the same time becoming more and more common.

One way of addressing this issue is by re-sampling the dataset so as to offset this imbalance, with the hope of arriving at a more robust and fair decision boundary than you would otherwise.
@@ -30,7 +40,7 @@ Below is a list of the methods currently implemented in this module.
3. bSMOTE(1&2) - Borderline SMOTE of types 1 and 2
4. SVM_SMOTE - Support Vectors SMOTE

* Over-sampling follow by under-sampling
* Over-sampling followed by under-sampling
1. SMOTE + Tomek links
2. SMOTE + ENN

@@ -44,8 +54,8 @@ Example:
This is a work in progress. Any comments, suggestions or corrections are welcome.

Dependencies:
* Numpy
* Scikit-Learn
* numpy
* scikit-learn

References:

@@ -15,11 +15,23 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"ename": "ValueError",
"evalue": "Attempted relative import in non-package",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-5-fb71cb5ef470>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecomposition\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mPCA\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 14\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mUnbalancedDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 15\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mUnderSampler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mNearMiss\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mCondensedNearestNeighbour\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOneSidedSelection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munbalanced_dataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTomekLinks\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mClusterCentroids\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOverSampler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSMOTE\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSMOTETomek\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Attempted relative import in non-package"
]
}
],
"source": [
"%matplotlib inline\n",
"\n",
@@ -34,9 +46,10 @@
"from sklearn.datasets import make_classification\n",
"from sklearn.decomposition import PCA\n",
"\n",
"from UnbalancedDataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection,\\\n",
"NeighboorhoodCleaningRule, TomekLinks, ClusterCentroids, OverSampler, SMOTE, bSMOTE1, bSMOTE2, SVM_SMOTE,\\\n",
"SMOTETomek, SMOTEENN, EasyEnsemble, BalanceCascade\n",
"from ..unbalanced_dataset import UnbalancedDataset\n",
"from ..unbalanced_dataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection\n",
"from ..unbalanced_dataset import TomekLinks, ClusterCentroids, OverSampler, SMOTE, SMOTETomek\n",
"from ..unbalanced_dataset import EasyEnsemble, BalanceCascade, NeighbourhoodCleaningRule, SMOTEENN\n",
"\n",
"# Save a nice dark grey as a variable\n",
"almost_black = '#262626'"
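The error output captured in the cell above ("Attempted relative import in non-package") is triggered by the new "from ..unbalanced_dataset import ..." lines: a notebook is not executed as part of a package, so relative imports cannot be resolved there. Below is a minimal sketch of the absolute imports that should work once the package is installed; the module paths are assumed from the layout used in test/test.py later in this diff:

    # Absolute imports usable from a notebook after installing the package.
    # Module paths are assumed from the layout shown in test/test.py in this diff.
    from unbalanced_dataset.over_sampling import OverSampler, SMOTE
    from unbalanced_dataset.under_sampling import UnderSampler, NearMiss, TomekLinks
    from unbalanced_dataset.ensemble_sampling import EasyEnsemble, BalanceCascade
    from unbalanced_dataset.pipeline import SMOTEENN, SMOTETomek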
25 changes: 25 additions & 0 deletions setup.py
@@ -0,0 +1,25 @@
from setuptools import setup, find_packages

install_requires = [
'numpy',
'scipy',
'scikit-learn',
]

setup(name='UnbalancedDataset',
version='0.1',
description='Python module with numerous re-sampling strategies to deal '
'with classification of data-sets with strong between class '
'imbalance.',
classifiers=[
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.4",
],
author='Fernando Nogueira, Guillaume Lemaitre',
author_email='fmfnogueira@gmail.com, guillaume.lemaitre@udg.edu',
url='https://github.com/fmfn/UnbalancedDataset',
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=install_requires,
)
141 changes: 0 additions & 141 deletions test/Visualization.py

This file was deleted.

21 changes: 18 additions & 3 deletions test/test.py
@@ -3,9 +3,24 @@

from sklearn.datasets import make_classification

from unbalanced_dataset import UnderSampler, NearMiss, CondensedNearestNeighbour, OneSidedSelection
from unbalanced_dataset import TomekLinks, ClusterCentroids, OverSampler, SMOTE, SMOTETomek
from unbalanced_dataset import EasyEnsemble, BalanceCascade, NeighbourhoodCleaningRule, SMOTEENN
from unbalanced_dataset.unbalanced_dataset import UnbalancedDataset

from unbalanced_dataset.over_sampling import OverSampler
from unbalanced_dataset.over_sampling import SMOTE

from unbalanced_dataset.under_sampling import UnderSampler
from unbalanced_dataset.under_sampling import TomekLinks
from unbalanced_dataset.under_sampling import ClusterCentroids
from unbalanced_dataset.under_sampling import NearMiss
from unbalanced_dataset.under_sampling import CondensedNearestNeighbour
from unbalanced_dataset.under_sampling import OneSidedSelection
from unbalanced_dataset.under_sampling import NeighbourhoodCleaningRule

from unbalanced_dataset.ensemble_sampling import EasyEnsemble
from unbalanced_dataset.ensemble_sampling import BalanceCascade

from unbalanced_dataset.pipeline import SMOTEENN
from unbalanced_dataset.pipeline import SMOTETomek

# Generate some data
print('Generate samples using scikit-learn')
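For reference, a minimal sketch of how these imports are typically exercised; the default constructor and the fit_transform(x, y) call are assumptions based on the package's usual usage pattern, not something shown in this hunk:

    # Sketch only: assumes SMOTE exposes fit_transform(x, y) returning the
    # re-sampled arrays; constructor left at defaults to avoid guessing keywords.
    from sklearn.datasets import make_classification
    from unbalanced_dataset.over_sampling import SMOTE

    x, y = make_classification(n_samples=5000, n_features=20, n_informative=10,
                               weights=[0.9, 0.1], random_state=0)
    sm = SMOTE()
    sx, sy = sm.fit_transform(x, y)
    print(sx.shape, sy.shape)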
20 changes: 19 additions & 1 deletion unbalanced_dataset/ensemble_sampling.py
@@ -207,6 +207,8 @@ def resample(self):

# Find the misclassified index to keep them for the next round
idx_mis_class = idx_sel_from_maj[np.nonzero(pred_label != N_y[idx_sel_from_maj])]
if self.verbose:
print("Elements misclassified: ", idx_mis_class)
# Count how many random element will be selected
n_elt_maj = self.ucd[self.minc] - idx_mis_class.size

@@ -218,8 +220,24 @@

# Check if we have to make an early stopping
if self.n_max_subset is not None:
if self.n_max_subset >= n_subsets:
if n_subsets == (self.n_max_subset - 1):
b_subset_search = False
# Select the remaining data
idx_sel_from_maj = np.nonzero(b_sel_N)[0]
idx_sel_from_maj = np.concatenate((idx_mis_class,
idx_sel_from_maj),
axis=0).astype(int)
# Select the final batch
x_data = np.concatenate((min_x, N_x[idx_sel_from_maj, :]), axis=0)
y_data = np.concatenate((min_y, N_y[idx_sel_from_maj]), axis=0)
# Push these data into a new subset
subsets_x.append(x_data)
subsets_y.append(y_data)
if self.verbose:
print("Creation of the subset #" + str(n_subsets))

# We found a new subset, increase the counter
n_subsets += 1
if self.verbose:
print('The maximum number of subsets has been reached')

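The block added above makes BalanceCascade stop once n_max_subset subsets have been built, pushing the remaining majority samples into one final subset instead of looping further. A minimal usage sketch, with keyword names inferred from the attributes referenced in this hunk (n_max_subset, verbose) and fit_transform(x, y) assumed as the entry point:

    # Sketch only: keyword names inferred from self.n_max_subset / self.verbose above;
    # the return value is assumed to be the lists of subsets built in resample().
    from sklearn.datasets import make_classification
    from unbalanced_dataset.ensemble_sampling import BalanceCascade

    x, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
    bc = BalanceCascade(n_max_subset=3, verbose=True)
    subsets_x, subsets_y = bc.fit_transform(x, y)
    print(len(subsets_x))  # no more than n_max_subset subsets, per the early stop above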
54 changes: 33 additions & 21 deletions unbalanced_dataset/over_sampling.py
@@ -419,34 +419,46 @@ def resample(self):
fractions = betavariate(alpha=10, beta=10)

# Interpolate samples in danger
nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
return_distance=False)[:, 1:]
if (np.count_nonzero(danger_bool) > 0):
nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
return_distance=False)[:, 1:]

sx1, sy1 = self.make_samples(support_vector[danger_bool],
minx,
self.minc, nns,
fractions * (int(self.ratio * len(minx)) + 1),
step_size=1,
random_state=self.rs,
verbose=self.verbose)
sx1, sy1 = self.make_samples(support_vector[danger_bool],
minx,
self.minc, nns,
fractions * (int(self.ratio * len(minx)) + 1),
step_size=1,
random_state=self.rs,
verbose=self.verbose)

# Extrapolate safe samples
nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
return_distance=False)[:, 1:]

sx2, sy2 = self.make_samples(support_vector[safety_bool],
minx,
self.minc, nns,
(1 - fractions) * int(self.ratio * len(minx)),
step_size=-self.out_step,
random_state=self.rs,
verbose=self.verbose)
if (np.count_nonzero(safety_bool) > 0):
nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
return_distance=False)[:, 1:]

sx2, sy2 = self.make_samples(support_vector[safety_bool],
minx,
self.minc, nns,
(1 - fractions) * int(self.ratio * len(minx)),
step_size=-self.out_step,
random_state=self.rs,
verbose=self.verbose)

if self.verbose:
print("done!")

# Concatenate the newly generated samples to the original data set
ret_x = concatenate((self.x, sx1, sx2), axis=0)
ret_y = concatenate((self.y, sy1, sy2), axis=0)
if ( (np.count_nonzero(danger_bool) > 0) and
(np.count_nonzero(safety_bool) > 0) ):
ret_x = concatenate((self.x, sx1, sx2), axis=0)
ret_y = concatenate((self.y, sy1, sy2), axis=0)
# No support vectors in danger
elif np.count_nonzero(danger_bool) == 0:
ret_x = concatenate((self.x, sx2), axis=0)
ret_y = concatenate((self.y, sy2), axis=0)
# All the support vectors are in danger
elif np.count_nonzero(safety_bool) == 0:
ret_x = concatenate((self.x, sx1), axis=0)
ret_y = concatenate((self.y, sy1), axis=0)

return ret_x, ret_y
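
With the guards above, SVM_SMOTE no longer assumes that both the "danger" and "safe" groups of support vectors are non-empty: only the sample arrays that were actually generated are concatenated back onto the original data. A minimal usage sketch; the default constructor, the module path and the fit_transform(x, y) entry point are assumptions rather than something shown in this hunk:

    # Sketch only: constructor left at defaults; fit_transform(x, y) is assumed to
    # run the resampling shown above and return the augmented arrays.
    from sklearn.datasets import make_classification
    from unbalanced_dataset.over_sampling import SVM_SMOTE

    x, y = make_classification(n_samples=500, weights=[0.95, 0.05], random_state=1)
    svm_sm = SVM_SMOTE()
    sx, sy = svm_sm.fit_transform(x, y)
    print(sx.shape, sy.shape)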