Skip to content

Commit e530a18

Browse files
authored
Merge pull request #44 from rasbt/nestedcv
update nested cv code with sklearn 0.18 syntax
2 parents fc0d326 + 128edcc commit e530a18

File tree

1 file changed

+243
-47
lines changed

1 file changed

+243
-47
lines changed

data_viz/model-evaluation-articles/nested_cv_code.ipynb

Lines changed: 243 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"**This Jupyter notebook contains the complimentary code for the Appendix section of the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"# A \"nested cross-validation for algorithm selection\" example using scikit-learn"
15+
]
16+
},
317
{
418
"cell_type": "code",
519
"execution_count": 1,
@@ -11,7 +25,7 @@
1125
"name": "stdout",
1226
"output_type": "stream",
1327
"text": [
14-
"Sebastian Raschka 2016-09-04 \n",
28+
"Sebastian Raschka 2016-09-30 \n",
1529
"\n",
1630
"CPython 3.5.2\n",
1731
"IPython 5.1.0\n",
@@ -26,20 +40,6 @@
2640
"%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
2741
]
2842
},
29-
{
30-
"cell_type": "markdown",
31-
"metadata": {},
32-
"source": [
33-
"**This Jupyter notebook contains the code to create the data visualizations for the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n"
34-
]
35-
},
36-
{
37-
"cell_type": "markdown",
38-
"metadata": {},
39-
"source": [
40-
"# A Nested cross-validation example using scikit-learn"
41-
]
42-
},
4343
{
4444
"cell_type": "code",
4545
"execution_count": 2,
@@ -75,16 +75,16 @@
7575
"X = X.astype(np.float32)\n",
7676
"X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
7777
" train_size=0.8,\n",
78-
" random_state=123,\n",
78+
" random_state=1,\n",
7979
" stratify=y)\n",
8080
"\n",
8181
"# Initializing Classifiers\n",
8282
"clf1 = LogisticRegression(multi_class='multinomial',\n",
8383
" solver='newton-cg',\n",
84-
" random_state=123)\n",
84+
" random_state=1)\n",
8585
"clf2 = KNeighborsClassifier(algorithm='ball_tree',\n",
8686
" leaf_size=50)\n",
87-
"clf3 = DecisionTreeClassifier(random_state=123)\n",
87+
"clf3 = DecisionTreeClassifier(random_state=1)\n",
8888
"clf4 = SVC(random_state=1)\n",
8989
"\n",
9090
"# Building the pipelines\n",
@@ -143,33 +143,33 @@
143143
"name": "stdout",
144144
"output_type": "stream",
145145
"text": [
146-
"outer fold 1/5 | tuning DTree | inner ACC 72.59% | outer ACC 76.00%\n",
147-
"outer fold 1/5 | tuning KNN | inner ACC 88.06% | outer ACC 90.75%\n",
148-
"outer fold 1/5 | tuning SVM | inner ACC 90.12% | outer ACC 91.50%\n",
149-
"outer fold 1/5 | tuning Softmax | inner ACC 87.88% | outer ACC 89.00%\n",
150-
"outer fold 2/5 | tuning DTree | inner ACC 72.88% | outer ACC 77.12%\n",
151-
"outer fold 2/5 | tuning KNN | inner ACC 88.28% | outer ACC 91.88%\n",
152-
"outer fold 2/5 | tuning SVM | inner ACC 90.06% | outer ACC 91.62%\n",
153-
"outer fold 2/5 | tuning Softmax | inner ACC 87.81% | outer ACC 90.62%\n",
154-
"outer fold 3/5 | tuning DTree | inner ACC 74.16% | outer ACC 78.38%\n",
155-
"outer fold 3/5 | tuning KNN | inner ACC 87.88% | outer ACC 90.38%\n",
156-
"outer fold 3/5 | tuning SVM | inner ACC 89.75% | outer ACC 92.25%\n",
157-
"outer fold 3/5 | tuning Softmax | inner ACC 87.78% | outer ACC 89.88%\n",
158-
"outer fold 4/5 | tuning DTree | inner ACC 74.47% | outer ACC 75.88%\n",
159-
"outer fold 4/5 | tuning KNN | inner ACC 88.44% | outer ACC 90.38%\n",
160-
"outer fold 4/5 | tuning SVM | inner ACC 90.41% | outer ACC 92.00%\n",
161-
"outer fold 4/5 | tuning Softmax | inner ACC 88.59% | outer ACC 89.38%\n",
162-
"outer fold 5/5 | tuning DTree | inner ACC 72.59% | outer ACC 70.62%\n",
163-
"outer fold 5/5 | tuning KNN | inner ACC 88.69% | outer ACC 90.50%\n",
164-
"outer fold 5/5 | tuning SVM | inner ACC 90.41% | outer ACC 90.50%\n",
165-
"outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 89.50%\n"
146+
"outer fold 1/5 | tuning DTree | inner ACC 72.38% | outer ACC 81.25%\n",
147+
"outer fold 1/5 | tuning KNN | inner ACC 88.19% | outer ACC 90.62%\n",
148+
"outer fold 1/5 | tuning SVM | inner ACC 89.88% | outer ACC 92.62%\n",
149+
"outer fold 1/5 | tuning Softmax | inner ACC 88.22% | outer ACC 91.88%\n",
150+
"outer fold 2/5 | tuning DTree | inner ACC 75.16% | outer ACC 76.25%\n",
151+
"outer fold 2/5 | tuning KNN | inner ACC 88.62% | outer ACC 90.62%\n",
152+
"outer fold 2/5 | tuning SVM | inner ACC 90.84% | outer ACC 91.25%\n",
153+
"outer fold 2/5 | tuning Softmax | inner ACC 89.00% | outer ACC 90.62%\n",
154+
"outer fold 3/5 | tuning DTree | inner ACC 74.25% | outer ACC 78.75%\n",
155+
"outer fold 3/5 | tuning KNN | inner ACC 87.81% | outer ACC 93.00%\n",
156+
"outer fold 3/5 | tuning SVM | inner ACC 89.69% | outer ACC 92.12%\n",
157+
"outer fold 3/5 | tuning Softmax | inner ACC 89.03% | outer ACC 90.38%\n",
158+
"outer fold 4/5 | tuning DTree | inner ACC 75.03% | outer ACC 73.62%\n",
159+
"outer fold 4/5 | tuning KNN | inner ACC 88.88% | outer ACC 90.50%\n",
160+
"outer fold 4/5 | tuning SVM | inner ACC 90.78% | outer ACC 90.38%\n",
161+
"outer fold 4/5 | tuning Softmax | inner ACC 89.25% | outer ACC 86.50%\n",
162+
"outer fold 5/5 | tuning DTree | inner ACC 73.31% | outer ACC 76.25%\n",
163+
"outer fold 5/5 | tuning KNN | inner ACC 88.41% | outer ACC 90.88%\n",
164+
"outer fold 5/5 | tuning SVM | inner ACC 90.28% | outer ACC 93.00%\n",
165+
"outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 90.62%\n"
166166
]
167167
}
168168
],
169169
"source": [
170170
"cv_scores = {name: [] for name, gs_est in gridcvs.items()}\n",
171171
"\n",
172-
"skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=123)\n",
172+
"skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)\n",
173173
"\n",
174174
"# The outer loop for algorithm selection\n",
175175
"c = 1\n",
@@ -199,12 +199,12 @@
199199
"name": "stdout",
200200
"output_type": "stream",
201201
"text": [
202-
"KNN | outer CV acc. 90.78% +\\- 0.567\n",
203-
"DTree | outer CV acc. 75.60% +\\- 2.646\n",
204-
"SVM | outer CV acc. 91.58% +\\- 0.600\n",
205-
"Softmax | outer CV acc. 89.68% +\\- 0.551\n",
202+
"DTree | outer CV acc. 77.22% +\\- 2.584\n",
203+
"KNN | outer CV acc. 91.13% +\\- 0.945\n",
204+
"Softmax | outer CV acc. 90.00% +\\- 1.827\n",
205+
"SVM | outer CV acc. 91.88% +\\- 0.952\n",
206206
"\n",
207-
"SVM Best parameters {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n"
207+
"SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n"
208208
]
209209
}
210210
],
@@ -228,9 +228,9 @@
228228
"output_type": "stream",
229229
"text": [
230230
"Accuracy 90.80% (average over CV test folds)\n",
231-
"Best Parameters: {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n",
232-
"Training Accuracy: 96.10%\n",
233-
"Test Accuracy: 92.70%\n"
231+
"Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n",
232+
"Training Accuracy: 99.92%\n",
233+
"Test Accuracy: 93.00%\n"
234234
]
235235
}
236236
],
@@ -263,9 +263,205 @@
263263
"best_clf = best_algo.best_estimator_\n",
264264
"final_model = best_clf.fit(X, y)"
265265
]
266+
},
267+
{
268+
"cell_type": "markdown",
269+
"metadata": {},
270+
"source": [
271+
"# Nested CV for algorithm selection in scikit-learn 0.18"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": 1,
277+
"metadata": {
278+
"collapsed": false
279+
},
280+
"outputs": [
281+
{
282+
"name": "stdout",
283+
"output_type": "stream",
284+
"text": [
285+
"Sebastian Raschka 2016-09-30 \n",
286+
"\n",
287+
"CPython 3.5.2\n",
288+
"IPython 5.1.0\n",
289+
"\n",
290+
"sklearn 0.18\n",
291+
"mlxtend 0.4.3dev0\n"
292+
]
293+
}
294+
],
295+
"source": [
296+
"%load_ext watermark\n",
297+
"%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
298+
]
299+
},
300+
{
301+
"cell_type": "markdown",
302+
"metadata": {
303+
"collapsed": true
304+
},
305+
"source": [
306+
"There were a lot of neat changes introduced in [scikit-learn 0.18](http://scikit-learn.org/dev/whats_new.html), released on on 28 Sep, 2016, that make nested CV a lot more convenient. "
307+
]
308+
},
309+
{
310+
"cell_type": "code",
311+
"execution_count": 2,
312+
"metadata": {
313+
"collapsed": false
314+
},
315+
"outputs": [],
316+
"source": [
317+
"import numpy as np\n",
318+
"from sklearn.model_selection import GridSearchCV\n",
319+
"from sklearn.model_selection import train_test_split\n",
320+
"from sklearn.model_selection import StratifiedKFold\n",
321+
"from sklearn.model_selection import cross_val_score\n",
322+
"from sklearn.pipeline import Pipeline\n",
323+
"from sklearn.preprocessing import StandardScaler\n",
324+
"from sklearn.linear_model import LogisticRegression\n",
325+
"from sklearn.neighbors import KNeighborsClassifier\n",
326+
"from sklearn.tree import DecisionTreeClassifier\n",
327+
"from sklearn.svm import SVC\n",
328+
"from mlxtend.data import mnist_data\n",
329+
"from sklearn.metrics import accuracy_score\n",
330+
"\n",
331+
"# Loading and splitting the dataset\n",
332+
"# Note that this is a small (stratified) subset\n",
333+
"# of MNIST; it consists of 5000 samples only, that is,\n",
334+
"# 10% of the original MNIST dataset\n",
335+
"# http://yann.lecun.com/exdb/mnist/\n",
336+
"X, y = mnist_data()\n",
337+
"X = X.astype(np.float32)\n",
338+
"X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
339+
" train_size=0.8,\n",
340+
" random_state=1,\n",
341+
" stratify=y)\n",
342+
"\n",
343+
"# Initializing Classifiers\n",
344+
"clf1 = LogisticRegression(multi_class='multinomial',\n",
345+
" solver='newton-cg',\n",
346+
" random_state=1)\n",
347+
"clf2 = KNeighborsClassifier(algorithm='ball_tree',\n",
348+
" leaf_size=50)\n",
349+
"clf3 = DecisionTreeClassifier(random_state=1)\n",
350+
"clf4 = SVC(random_state=1)\n",
351+
"\n",
352+
"# Building the pipelines\n",
353+
"pipe1 = Pipeline([('std', StandardScaler()),\n",
354+
" ('clf1', clf1)])\n",
355+
"\n",
356+
"pipe2 = Pipeline([('std', StandardScaler()),\n",
357+
" ('clf2', clf2)])\n",
358+
"\n",
359+
"pipe4 = Pipeline([('std', StandardScaler()),\n",
360+
" ('clf4', clf4)])\n",
361+
"\n",
362+
"\n",
363+
"# Setting up the parameter grids\n",
364+
"param_grid1 = [{'clf1__penalty': ['l2'],\n",
365+
" 'clf1__C': np.power(10., np.arange(-4, 4))}]\n",
366+
"\n",
367+
"param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n",
368+
" 'clf2__p': [1, 2]}]\n",
369+
"\n",
370+
"param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n",
371+
" 'criterion': ['gini', 'entropy']}]\n",
372+
"\n",
373+
"param_grid4 = [{'clf4__kernel': ['rbf'],\n",
374+
" 'clf4__C': np.power(10., np.arange(-4, 4)),\n",
375+
" 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n",
376+
" {'clf4__kernel': ['linear'],\n",
377+
" 'clf4__C': np.power(10., np.arange(-4, 4))}]\n",
378+
"\n",
379+
"# Setting up multiple GridSearchCV objects, 1 for each algorithm\n",
380+
"gridcvs = {}\n",
381+
"inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n",
382+
"\n",
383+
"for pgrid, est, name in zip((param_grid1, param_grid2,\n",
384+
" param_grid3, param_grid4),\n",
385+
" (pipe1, pipe2, clf3, pipe4),\n",
386+
" ('Softmax', 'KNN', 'DTree', 'SVM')):\n",
387+
" gcv = GridSearchCV(estimator=est,\n",
388+
" param_grid=pgrid,\n",
389+
" scoring='accuracy',\n",
390+
" n_jobs=1,\n",
391+
" cv=inner_cv,\n",
392+
" verbose=0,\n",
393+
" refit=True)\n",
394+
" gridcvs[name] = gcv"
395+
]
396+
},
397+
{
398+
"cell_type": "code",
399+
"execution_count": 3,
400+
"metadata": {
401+
"collapsed": false
402+
},
403+
"outputs": [
404+
{
405+
"name": "stdout",
406+
"output_type": "stream",
407+
"text": [
408+
"DTree | outer ACC 77.33% +/- 2.72\n",
409+
"KNN | outer ACC 91.10% +/- 0.96\n",
410+
"SVM | outer ACC 91.95% +/- 1.04\n",
411+
"Softmax | outer ACC 90.32% +/- 1.22\n"
412+
]
413+
}
414+
],
415+
"source": [
416+
"outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n",
417+
"\n",
418+
"for name, gs_est in sorted(gridcvs.items()):\n",
419+
" nested_score = cross_val_score(gs_est, \n",
420+
" X=X_train, \n",
421+
" y=y_train, \n",
422+
" cv=outer_cv,\n",
423+
" n_jobs=1)\n",
424+
" print('%s | outer ACC %.2f%% +/- %.2f' % \n",
425+
" (name, nested_score.mean() * 100, nested_score.std() * 100))"
426+
]
427+
},
428+
{
429+
"cell_type": "code",
430+
"execution_count": 4,
431+
"metadata": {
432+
"collapsed": false
433+
},
434+
"outputs": [
435+
{
436+
"name": "stdout",
437+
"output_type": "stream",
438+
"text": [
439+
"Accuracy 91.03% (average over CV test folds)\n",
440+
"Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n",
441+
"Training Accuracy: 99.92%\n",
442+
"Test Accuracy: 93.00%\n"
443+
]
444+
}
445+
],
446+
"source": [
447+
"# Fitting a model to the whole training set\n",
448+
"# using the \"best\" algorithm\n",
449+
"best_algo = gridcvs['SVM']\n",
450+
"\n",
451+
"best_algo.fit(X_train, y_train)\n",
452+
"train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n",
453+
"test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n",
454+
"\n",
455+
"print('Accuracy %.2f%% (average over CV test folds)' %\n",
456+
" (100 * best_algo.best_score_))\n",
457+
"print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n",
458+
"print('Training Accuracy: %.2f%%' % (100 * train_acc))\n",
459+
"print('Test Accuracy: %.2f%%' % (100 * test_acc))"
460+
]
266461
}
267462
],
268463
"metadata": {
464+
"anaconda-cloud": {},
269465
"kernelspec": {
270466
"display_name": "Python 3",
271467
"language": "python",

0 commit comments

Comments
 (0)