|
1 | 1 | { |
2 | 2 | "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "**This Jupyter notebook contains the complementary code for the Appendix section of the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n" |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "markdown", |
| 12 | + "metadata": {}, |
| 13 | + "source": [ |
| 14 | + "# A \"nested cross-validation for algorithm selection\" example using scikit-learn" |
| 15 | + ] |
| 16 | + }, |
3 | 17 | { |
4 | 18 | "cell_type": "code", |
5 | 19 | "execution_count": 1, |
|
11 | 25 | "name": "stdout", |
12 | 26 | "output_type": "stream", |
13 | 27 | "text": [ |
14 | | - "Sebastian Raschka 2016-09-04 \n", |
| 28 | + "Sebastian Raschka 2016-09-30 \n", |
15 | 29 | "\n", |
16 | 30 | "CPython 3.5.2\n", |
17 | 31 | "IPython 5.1.0\n", |
|
26 | 40 | "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" |
27 | 41 | ] |
28 | 42 | }, |
29 | | - { |
30 | | - "cell_type": "markdown", |
31 | | - "metadata": {}, |
32 | | - "source": [ |
33 | | - "**This Jupyter notebook contains the code to create the data visualizations for the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n" |
34 | | - ] |
35 | | - }, |
36 | | - { |
37 | | - "cell_type": "markdown", |
38 | | - "metadata": {}, |
39 | | - "source": [ |
40 | | - "# A Nested cross-validation example using scikit-learn" |
41 | | - ] |
42 | | - }, |
43 | 43 | { |
44 | 44 | "cell_type": "code", |
45 | 45 | "execution_count": 2, |
|
75 | 75 | "X = X.astype(np.float32)\n", |
76 | 76 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", |
77 | 77 | " train_size=0.8,\n", |
78 | | - " random_state=123,\n", |
| 78 | + " random_state=1,\n", |
79 | 79 | " stratify=y)\n", |
80 | 80 | "\n", |
81 | 81 | "# Initializing Classifiers\n", |
82 | 82 | "clf1 = LogisticRegression(multi_class='multinomial',\n", |
83 | 83 | " solver='newton-cg',\n", |
84 | | - " random_state=123)\n", |
| 84 | + " random_state=1)\n", |
85 | 85 | "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", |
86 | 86 | " leaf_size=50)\n", |
87 | | - "clf3 = DecisionTreeClassifier(random_state=123)\n", |
| 87 | + "clf3 = DecisionTreeClassifier(random_state=1)\n", |
88 | 88 | "clf4 = SVC(random_state=1)\n", |
89 | 89 | "\n", |
90 | 90 | "# Building the pipelines\n", |
|
143 | 143 | "name": "stdout", |
144 | 144 | "output_type": "stream", |
145 | 145 | "text": [ |
146 | | - "outer fold 1/5 | tuning DTree | inner ACC 72.59% | outer ACC 76.00%\n", |
147 | | - "outer fold 1/5 | tuning KNN | inner ACC 88.06% | outer ACC 90.75%\n", |
148 | | - "outer fold 1/5 | tuning SVM | inner ACC 90.12% | outer ACC 91.50%\n", |
149 | | - "outer fold 1/5 | tuning Softmax | inner ACC 87.88% | outer ACC 89.00%\n", |
150 | | - "outer fold 2/5 | tuning DTree | inner ACC 72.88% | outer ACC 77.12%\n", |
151 | | - "outer fold 2/5 | tuning KNN | inner ACC 88.28% | outer ACC 91.88%\n", |
152 | | - "outer fold 2/5 | tuning SVM | inner ACC 90.06% | outer ACC 91.62%\n", |
153 | | - "outer fold 2/5 | tuning Softmax | inner ACC 87.81% | outer ACC 90.62%\n", |
154 | | - "outer fold 3/5 | tuning DTree | inner ACC 74.16% | outer ACC 78.38%\n", |
155 | | - "outer fold 3/5 | tuning KNN | inner ACC 87.88% | outer ACC 90.38%\n", |
156 | | - "outer fold 3/5 | tuning SVM | inner ACC 89.75% | outer ACC 92.25%\n", |
157 | | - "outer fold 3/5 | tuning Softmax | inner ACC 87.78% | outer ACC 89.88%\n", |
158 | | - "outer fold 4/5 | tuning DTree | inner ACC 74.47% | outer ACC 75.88%\n", |
159 | | - "outer fold 4/5 | tuning KNN | inner ACC 88.44% | outer ACC 90.38%\n", |
160 | | - "outer fold 4/5 | tuning SVM | inner ACC 90.41% | outer ACC 92.00%\n", |
161 | | - "outer fold 4/5 | tuning Softmax | inner ACC 88.59% | outer ACC 89.38%\n", |
162 | | - "outer fold 5/5 | tuning DTree | inner ACC 72.59% | outer ACC 70.62%\n", |
163 | | - "outer fold 5/5 | tuning KNN | inner ACC 88.69% | outer ACC 90.50%\n", |
164 | | - "outer fold 5/5 | tuning SVM | inner ACC 90.41% | outer ACC 90.50%\n", |
165 | | - "outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 89.50%\n" |
| 146 | + "outer fold 1/5 | tuning DTree | inner ACC 72.38% | outer ACC 81.25%\n", |
| 147 | + "outer fold 1/5 | tuning KNN | inner ACC 88.19% | outer ACC 90.62%\n", |
| 148 | + "outer fold 1/5 | tuning SVM | inner ACC 89.88% | outer ACC 92.62%\n", |
| 149 | + "outer fold 1/5 | tuning Softmax | inner ACC 88.22% | outer ACC 91.88%\n", |
| 150 | + "outer fold 2/5 | tuning DTree | inner ACC 75.16% | outer ACC 76.25%\n", |
| 151 | + "outer fold 2/5 | tuning KNN | inner ACC 88.62% | outer ACC 90.62%\n", |
| 152 | + "outer fold 2/5 | tuning SVM | inner ACC 90.84% | outer ACC 91.25%\n", |
| 153 | + "outer fold 2/5 | tuning Softmax | inner ACC 89.00% | outer ACC 90.62%\n", |
| 154 | + "outer fold 3/5 | tuning DTree | inner ACC 74.25% | outer ACC 78.75%\n", |
| 155 | + "outer fold 3/5 | tuning KNN | inner ACC 87.81% | outer ACC 93.00%\n", |
| 156 | + "outer fold 3/5 | tuning SVM | inner ACC 89.69% | outer ACC 92.12%\n", |
| 157 | + "outer fold 3/5 | tuning Softmax | inner ACC 89.03% | outer ACC 90.38%\n", |
| 158 | + "outer fold 4/5 | tuning DTree | inner ACC 75.03% | outer ACC 73.62%\n", |
| 159 | + "outer fold 4/5 | tuning KNN | inner ACC 88.88% | outer ACC 90.50%\n", |
| 160 | + "outer fold 4/5 | tuning SVM | inner ACC 90.78% | outer ACC 90.38%\n", |
| 161 | + "outer fold 4/5 | tuning Softmax | inner ACC 89.25% | outer ACC 86.50%\n", |
| 162 | + "outer fold 5/5 | tuning DTree | inner ACC 73.31% | outer ACC 76.25%\n", |
| 163 | + "outer fold 5/5 | tuning KNN | inner ACC 88.41% | outer ACC 90.88%\n", |
| 164 | + "outer fold 5/5 | tuning SVM | inner ACC 90.28% | outer ACC 93.00%\n", |
| 165 | + "outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 90.62%\n" |
166 | 166 | ] |
167 | 167 | } |
168 | 168 | ], |
169 | 169 | "source": [ |
170 | 170 | "cv_scores = {name: [] for name, gs_est in gridcvs.items()}\n", |
171 | 171 | "\n", |
172 | | - "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=123)\n", |
| 172 | + "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)\n", |
173 | 173 | "\n", |
174 | 174 | "# The outer loop for algorithm selection\n", |
175 | 175 | "c = 1\n", |
|
199 | 199 | "name": "stdout", |
200 | 200 | "output_type": "stream", |
201 | 201 | "text": [ |
202 | | - "KNN | outer CV acc. 90.78% +\\- 0.567\n", |
203 | | - "DTree | outer CV acc. 75.60% +\\- 2.646\n", |
204 | | - "SVM | outer CV acc. 91.58% +\\- 0.600\n", |
205 | | - "Softmax | outer CV acc. 89.68% +\\- 0.551\n", |
| 202 | + "DTree | outer CV acc. 77.22% +\\- 2.584\n", |
| 203 | + "KNN | outer CV acc. 91.13% +\\- 0.945\n", |
| 204 | + "Softmax | outer CV acc. 90.00% +\\- 1.827\n", |
| 205 | + "SVM | outer CV acc. 91.88% +\\- 0.952\n", |
206 | 206 | "\n", |
207 | | - "SVM Best parameters {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n" |
| 207 | + "SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n" |
208 | 208 | ] |
209 | 209 | } |
210 | 210 | ], |
|
228 | 228 | "output_type": "stream", |
229 | 229 | "text": [ |
230 | 230 | "Accuracy 90.80% (average over CV test folds)\n", |
231 | | - "Best Parameters: {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n", |
232 | | - "Training Accuracy: 96.10%\n", |
233 | | - "Test Accuracy: 92.70%\n" |
| 231 | + "Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n", |
| 232 | + "Training Accuracy: 99.92%\n", |
| 233 | + "Test Accuracy: 93.00%\n" |
234 | 234 | ] |
235 | 235 | } |
236 | 236 | ], |
|
263 | 263 | "best_clf = best_algo.best_estimator_\n", |
264 | 264 | "final_model = best_clf.fit(X, y)" |
265 | 265 | ] |
| 266 | + }, |
| 267 | + { |
| 268 | + "cell_type": "markdown", |
| 269 | + "metadata": {}, |
| 270 | + "source": [ |
| 271 | + "# Nested CV for algorithm selection in scikit-learn 0.18" |
| 272 | + ] |
| 273 | + }, |
| 274 | + { |
| 275 | + "cell_type": "code", |
| 276 | + "execution_count": 1, |
| 277 | + "metadata": { |
| 278 | + "collapsed": false |
| 279 | + }, |
| 280 | + "outputs": [ |
| 281 | + { |
| 282 | + "name": "stdout", |
| 283 | + "output_type": "stream", |
| 284 | + "text": [ |
| 285 | + "Sebastian Raschka 2016-09-30 \n", |
| 286 | + "\n", |
| 287 | + "CPython 3.5.2\n", |
| 288 | + "IPython 5.1.0\n", |
| 289 | + "\n", |
| 290 | + "sklearn 0.18\n", |
| 291 | + "mlxtend 0.4.3dev0\n" |
| 292 | + ] |
| 293 | + } |
| 294 | + ], |
| 295 | + "source": [ |
| 296 | + "%load_ext watermark\n", |
| 297 | + "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "markdown", |
| 302 | + "metadata": { |
| 303 | + "collapsed": true |
| 304 | + }, |
| 305 | + "source": [ |
| 306 | + "There were a lot of neat changes introduced in [scikit-learn 0.18](http://scikit-learn.org/dev/whats_new.html), released on 28 Sep, 2016, that make nested CV a lot more convenient. " |
| 307 | + ] |
| 308 | + }, |
| 309 | + { |
| 310 | + "cell_type": "code", |
| 311 | + "execution_count": 2, |
| 312 | + "metadata": { |
| 313 | + "collapsed": false |
| 314 | + }, |
| 315 | + "outputs": [], |
| 316 | + "source": [ |
| 317 | + "import numpy as np\n", |
| 318 | + "from sklearn.model_selection import GridSearchCV\n", |
| 319 | + "from sklearn.model_selection import train_test_split\n", |
| 320 | + "from sklearn.model_selection import StratifiedKFold\n", |
| 321 | + "from sklearn.model_selection import cross_val_score\n", |
| 322 | + "from sklearn.pipeline import Pipeline\n", |
| 323 | + "from sklearn.preprocessing import StandardScaler\n", |
| 324 | + "from sklearn.linear_model import LogisticRegression\n", |
| 325 | + "from sklearn.neighbors import KNeighborsClassifier\n", |
| 326 | + "from sklearn.tree import DecisionTreeClassifier\n", |
| 327 | + "from sklearn.svm import SVC\n", |
| 328 | + "from mlxtend.data import mnist_data\n", |
| 329 | + "from sklearn.metrics import accuracy_score\n", |
| 330 | + "\n", |
| 331 | + "# Loading and splitting the dataset\n", |
| 332 | + "# Note that this is a small (stratified) subset\n", |
| 333 | + "# of MNIST; it consists of 5000 samples only, that is,\n", |
| 334 | + "# 10% of the original MNIST dataset\n", |
| 335 | + "# http://yann.lecun.com/exdb/mnist/\n", |
| 336 | + "X, y = mnist_data()\n", |
| 337 | + "X = X.astype(np.float32)\n", |
| 338 | + "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", |
| 339 | + " train_size=0.8,\n", |
| 340 | + " random_state=1,\n", |
| 341 | + " stratify=y)\n", |
| 342 | + "\n", |
| 343 | + "# Initializing Classifiers\n", |
| 344 | + "clf1 = LogisticRegression(multi_class='multinomial',\n", |
| 345 | + " solver='newton-cg',\n", |
| 346 | + " random_state=1)\n", |
| 347 | + "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", |
| 348 | + " leaf_size=50)\n", |
| 349 | + "clf3 = DecisionTreeClassifier(random_state=1)\n", |
| 350 | + "clf4 = SVC(random_state=1)\n", |
| 351 | + "\n", |
| 352 | + "# Building the pipelines\n", |
| 353 | + "pipe1 = Pipeline([('std', StandardScaler()),\n", |
| 354 | + " ('clf1', clf1)])\n", |
| 355 | + "\n", |
| 356 | + "pipe2 = Pipeline([('std', StandardScaler()),\n", |
| 357 | + " ('clf2', clf2)])\n", |
| 358 | + "\n", |
| 359 | + "pipe4 = Pipeline([('std', StandardScaler()),\n", |
| 360 | + " ('clf4', clf4)])\n", |
| 361 | + "\n", |
| 362 | + "\n", |
| 363 | + "# Setting up the parameter grids\n", |
| 364 | + "param_grid1 = [{'clf1__penalty': ['l2'],\n", |
| 365 | + " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", |
| 366 | + "\n", |
| 367 | + "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", |
| 368 | + " 'clf2__p': [1, 2]}]\n", |
| 369 | + "\n", |
| 370 | + "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", |
| 371 | + " 'criterion': ['gini', 'entropy']}]\n", |
| 372 | + "\n", |
| 373 | + "param_grid4 = [{'clf4__kernel': ['rbf'],\n", |
| 374 | + " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", |
| 375 | + " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", |
| 376 | + " {'clf4__kernel': ['linear'],\n", |
| 377 | + " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", |
| 378 | + "\n", |
| 379 | + "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", |
| 380 | + "gridcvs = {}\n", |
| 381 | + "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", |
| 382 | + "\n", |
| 383 | + "for pgrid, est, name in zip((param_grid1, param_grid2,\n", |
| 384 | + " param_grid3, param_grid4),\n", |
| 385 | + " (pipe1, pipe2, clf3, pipe4),\n", |
| 386 | + " ('Softmax', 'KNN', 'DTree', 'SVM')):\n", |
| 387 | + " gcv = GridSearchCV(estimator=est,\n", |
| 388 | + " param_grid=pgrid,\n", |
| 389 | + " scoring='accuracy',\n", |
| 390 | + " n_jobs=1,\n", |
| 391 | + " cv=inner_cv,\n", |
| 392 | + " verbose=0,\n", |
| 393 | + " refit=True)\n", |
| 394 | + " gridcvs[name] = gcv" |
| 395 | + ] |
| 396 | + }, |
| 397 | + { |
| 398 | + "cell_type": "code", |
| 399 | + "execution_count": 3, |
| 400 | + "metadata": { |
| 401 | + "collapsed": false |
| 402 | + }, |
| 403 | + "outputs": [ |
| 404 | + { |
| 405 | + "name": "stdout", |
| 406 | + "output_type": "stream", |
| 407 | + "text": [ |
| 408 | + "DTree | outer ACC 77.33% +/- 2.72\n", |
| 409 | + "KNN | outer ACC 91.10% +/- 0.96\n", |
| 410 | + "SVM | outer ACC 91.95% +/- 1.04\n", |
| 411 | + "Softmax | outer ACC 90.32% +/- 1.22\n" |
| 412 | + ] |
| 413 | + } |
| 414 | + ], |
| 415 | + "source": [ |
| 416 | + "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", |
| 417 | + "\n", |
| 418 | + "for name, gs_est in sorted(gridcvs.items()):\n", |
| 419 | + " nested_score = cross_val_score(gs_est, \n", |
| 420 | + " X=X_train, \n", |
| 421 | + " y=y_train, \n", |
| 422 | + " cv=outer_cv,\n", |
| 423 | + " n_jobs=1)\n", |
| 424 | + " print('%s | outer ACC %.2f%% +/- %.2f' % \n", |
| 425 | + " (name, nested_score.mean() * 100, nested_score.std() * 100))" |
| 426 | + ] |
| 427 | + }, |
| 428 | + { |
| 429 | + "cell_type": "code", |
| 430 | + "execution_count": 4, |
| 431 | + "metadata": { |
| 432 | + "collapsed": false |
| 433 | + }, |
| 434 | + "outputs": [ |
| 435 | + { |
| 436 | + "name": "stdout", |
| 437 | + "output_type": "stream", |
| 438 | + "text": [ |
| 439 | + "Accuracy 91.03% (average over CV test folds)\n", |
| 440 | + "Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n", |
| 441 | + "Training Accuracy: 99.92%\n", |
| 442 | + "Test Accuracy: 93.00%\n" |
| 443 | + ] |
| 444 | + } |
| 445 | + ], |
| 446 | + "source": [ |
| 447 | + "# Fitting a model to the whole training set\n", |
| 448 | + "# using the \"best\" algorithm\n", |
| 449 | + "best_algo = gridcvs['SVM']\n", |
| 450 | + "\n", |
| 451 | + "best_algo.fit(X_train, y_train)\n", |
| 452 | + "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n", |
| 453 | + "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n", |
| 454 | + "\n", |
| 455 | + "print('Accuracy %.2f%% (average over CV test folds)' %\n", |
| 456 | + " (100 * best_algo.best_score_))\n", |
| 457 | + "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n", |
| 458 | + "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", |
| 459 | + "print('Test Accuracy: %.2f%%' % (100 * test_acc))" |
| 460 | + ] |
266 | 461 | } |
267 | 462 | ], |
268 | 463 | "metadata": { |
| 464 | + "anaconda-cloud": {}, |
269 | 465 | "kernelspec": { |
270 | 466 | "display_name": "Python 3", |
271 | 467 | "language": "python", |
|
0 commit comments