- Notifications: you must be signed in to change notification settings
- Fork 1.3k
[MRG] EHN add collections of imbalanced datasets #249
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
f8a2309 ac13c51 0381b7d b29fc34 8c27e75 ddbed2d 2d5a7d1 47f6b9b 11ea904 2fd4295 6b95144 0bd1522 a3f7f8d ea96823 ad23b4d 5f79302 82bc13a 28e7830
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -158,7 +158,7 @@ Functions | |
| :toctree: generated/ | ||
| | ||
| datasets.make_imbalance | ||
| | ||
| datasets.fetch_zenodo | ||
| | ||
| Utilities | ||
| ========= | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -14,6 +14,8 @@ New features | |
| | ||
| - Turn off steps in :class:`pipeline.Pipeline` using the `None` | ||
| object. By `Christos Aridas`_. | ||
| - Add a fetching method `datasets.fetch_zenodo` in order to get some | ||
| imbalanced datasets useful for benchmarking. By `Guillaume Lemaitre`_. | ||
| | ||
| Enhancement | ||
| ~~~~~~~~~~~ | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -4,5 +4,7 @@ | |
| """ | ||
| | ||
| from .imbalance import make_imbalance | ||
| from .zenodo import fetch_zenodo | ||
| | ||
| __all__ = ['make_imbalance'] | ||
| __all__ = ['make_imbalance', | ||
| Member: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: Sort them if you like. Member Author: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: Uhm, not sure, since they are not in the same file. | ||
| 'fetch_zenodo'] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| """Test the zenodo loader. | ||
| | ||
| Skipped if zenodo is not already downloaded to data_home. | ||
| """ | ||
| from imblearn.datasets import fetch_zenodo | ||
| from sklearn.utils.testing import (assert_equal, assert_allclose, | ||
| assert_raises_regex, SkipTest) | ||
| | ||
| DATASET_SHAPE = {'ecoli': (336, 7), | ||
| 'optical_digits': (5620, 64), | ||
| 'satimage': (6435, 36), | ||
| 'pen_digits': (10992, 16), | ||
| 'abalone': (4177, 10), | ||
| 'sick_euthyroid': (3163, 42), | ||
| 'spectrometer': (531, 93), | ||
| 'car_eval_34': (1728, 21), | ||
| 'isolet': (7797, 617), | ||
| 'us_crime': (1994, 100), | ||
| 'yeast_ml8': (2417, 103), | ||
| 'scene': (2407, 294), | ||
| 'libras_move': (360, 90), | ||
| 'thyroid_sick': (3772, 52), | ||
| 'coil_2000': (9822, 85), | ||
| 'arrhythmia': (452, 278), | ||
| 'solar_flare_m0': (1389, 32), | ||
| 'oil': (937, 49), | ||
| 'car_eval_4': (1728, 21), | ||
| 'wine_quality': (4898, 11), | ||
| 'letter_img': (20000, 16), | ||
| 'yeast_me2': (1484, 8), | ||
| 'webpage': (34780, 300), | ||
| 'ozone_level': (2536, 72), | ||
| 'mammography': (11183, 6), | ||
| 'protein_homo': (145751, 74), | ||
| 'abalone_19': (4177, 10)} | ||
| | ||
| | ||
| def fetch(*args, **kwargs): | ||
| return fetch_zenodo(*args, download_if_missing=True, **kwargs) | ||
| | ||
| | ||
| def test_fetch(): | ||
| try: | ||
| datasets1 = fetch(shuffle=True, random_state=42) | ||
| except IOError: | ||
| raise SkipTest("Zenodo dataset can not be loaded.") | ||
| | ||
| datasets2 = fetch(shuffle=True, random_state=37) | ||
| | ||
| for k in DATASET_SHAPE.keys(): | ||
| | ||
| X1, X2 = datasets1[k].data, datasets2[k].data | ||
| assert_equal(DATASET_SHAPE[k], X1.shape) | ||
| assert_equal(X1.shape, X2.shape) | ||
| | ||
| assert_allclose(X1.sum(), X2.sum()) | ||
| | ||
| y1, y2 = datasets1[k].target, datasets2[k].target | ||
| assert_equal((X1.shape[0],), y1.shape) | ||
| assert_equal((X1.shape[0],), y2.shape) | ||
| | ||
| | ||
| def test_fetch_filter(): | ||
| try: | ||
| datasets1 = fetch(filter_data=tuple([1]), shuffle=True, | ||
| random_state=42) | ||
| except IOError: | ||
| raise SkipTest("Zenodo dataset can not be loaded.") | ||
| | ||
| datasets2 = fetch(filter_data=tuple(['ecoli']), shuffle=True, | ||
| random_state=37) | ||
| | ||
| X1, X2 = datasets1['ecoli'].data, datasets2['ecoli'].data | ||
| assert_equal(DATASET_SHAPE['ecoli'], X1.shape) | ||
| assert_equal(X1.shape, X2.shape) | ||
| | ||
| assert_allclose(X1.sum(), X2.sum()) | ||
| | ||
| y1, y2 = datasets1['ecoli'].target, datasets2['ecoli'].target | ||
| assert_equal((X1.shape[0],), y1.shape) | ||
| assert_equal((X1.shape[0],), y2.shape) | ||
| | ||
| | ||
| def test_fetch_error(): | ||
| assert_raises_regex(ValueError, 'is not a dataset available.', | ||
| fetch_zenodo, filter_data=tuple(['rnd'])) | ||
| assert_raises_regex(ValueError, 'dataset with the ID=', | ||
| fetch_zenodo, filter_data=tuple([-1])) | ||
| assert_raises_regex(ValueError, 'dataset with the ID=', | ||
| fetch_zenodo, filter_data=tuple([100])) | ||
| assert_raises_regex(ValueError, 'value in the tuple', | ||
| fetch_zenodo, filter_data=tuple([1.00])) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just `fetch_datasets`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted something similar to `fetch_mldata`.