rasbt
diff --git a/‎data_viz/model-evaluation-articles/nested_cv_code.ipynb‎
Lines changed: 243 additions & 47 deletions b/‎data_viz/model-evaluation-articles/nested_cv_code.ipynb‎
Lines changed: 243 additions & 47 deletions
@@ -1,5 +1,19 @@
 {
  "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**This Jupyter notebook contains the complimentary code for the Appendix section of the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# A \"nested cross-validation for algorithm selection\" example using scikit-learn"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": 1,
@@ -11,7 +25,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Sebastian Raschka 2016-09-04 \n",
+ "Sebastian Raschka 2016-09-30 \n",
  "\n",
  "CPython 3.5.2\n",
  "IPython 5.1.0\n",
@@ -26,20 +40,6 @@
  "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
  ]
  },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**This Jupyter notebook contains the code to create the data visualizations for the article \"Model evaluation, model selection, and algorithm selection in machine learning - Part IV\" at http://sebastianraschka.com/blog/2016/model-evaluation-selection-part4.html.**\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# A Nested cross-validation example using scikit-learn"
- ]
- },
  {
  "cell_type": "code",
  "execution_count": 2,
@@ -75,16 +75,16 @@
  "X = X.astype(np.float32)\n",
  "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
  " train_size=0.8,\n",
- " random_state=123,\n",
+ " random_state=1,\n",
  " stratify=y)\n",
  "\n",
  "# Initializing Classifiers\n",
  "clf1 = LogisticRegression(multi_class='multinomial',\n",
  " solver='newton-cg',\n",
- " random_state=123)\n",
+ " random_state=1)\n",
  "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n",
  " leaf_size=50)\n",
- "clf3 = DecisionTreeClassifier(random_state=123)\n",
+ "clf3 = DecisionTreeClassifier(random_state=1)\n",
  "clf4 = SVC(random_state=1)\n",
  "\n",
  "# Building the pipelines\n",
@@ -143,33 +143,33 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "outer fold 1/5 | tuning DTree | inner ACC 72.59% | outer ACC 76.00%\n",
- "outer fold 1/5 | tuning KNN | inner ACC 88.06% | outer ACC 90.75%\n",
- "outer fold 1/5 | tuning SVM | inner ACC 90.12% | outer ACC 91.50%\n",
- "outer fold 1/5 | tuning Softmax | inner ACC 87.88% | outer ACC 89.00%\n",
- "outer fold 2/5 | tuning DTree | inner ACC 72.88% | outer ACC 77.12%\n",
- "outer fold 2/5 | tuning KNN | inner ACC 88.28% | outer ACC 91.88%\n",
- "outer fold 2/5 | tuning SVM | inner ACC 90.06% | outer ACC 91.62%\n",
- "outer fold 2/5 | tuning Softmax | inner ACC 87.81% | outer ACC 90.62%\n",
- "outer fold 3/5 | tuning DTree | inner ACC 74.16% | outer ACC 78.38%\n",
- "outer fold 3/5 | tuning KNN | inner ACC 87.88% | outer ACC 90.38%\n",
- "outer fold 3/5 | tuning SVM | inner ACC 89.75% | outer ACC 92.25%\n",
- "outer fold 3/5 | tuning Softmax | inner ACC 87.78% | outer ACC 89.88%\n",
- "outer fold 4/5 | tuning DTree | inner ACC 74.47% | outer ACC 75.88%\n",
- "outer fold 4/5 | tuning KNN | inner ACC 88.44% | outer ACC 90.38%\n",
- "outer fold 4/5 | tuning SVM | inner ACC 90.41% | outer ACC 92.00%\n",
- "outer fold 4/5 | tuning Softmax | inner ACC 88.59% | outer ACC 89.38%\n",
- "outer fold 5/5 | tuning DTree | inner ACC 72.59% | outer ACC 70.62%\n",
- "outer fold 5/5 | tuning KNN | inner ACC 88.69% | outer ACC 90.50%\n",
- "outer fold 5/5 | tuning SVM | inner ACC 90.41% | outer ACC 90.50%\n",
- "outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 89.50%\n"
+ "outer fold 1/5 | tuning DTree | inner ACC 72.38% | outer ACC 81.25%\n",
+ "outer fold 1/5 | tuning KNN | inner ACC 88.19% | outer ACC 90.62%\n",
+ "outer fold 1/5 | tuning SVM | inner ACC 89.88% | outer ACC 92.62%\n",
+ "outer fold 1/5 | tuning Softmax | inner ACC 88.22% | outer ACC 91.88%\n",
+ "outer fold 2/5 | tuning DTree | inner ACC 75.16% | outer ACC 76.25%\n",
+ "outer fold 2/5 | tuning KNN | inner ACC 88.62% | outer ACC 90.62%\n",
+ "outer fold 2/5 | tuning SVM | inner ACC 90.84% | outer ACC 91.25%\n",
+ "outer fold 2/5 | tuning Softmax | inner ACC 89.00% | outer ACC 90.62%\n",
+ "outer fold 3/5 | tuning DTree | inner ACC 74.25% | outer ACC 78.75%\n",
+ "outer fold 3/5 | tuning KNN | inner ACC 87.81% | outer ACC 93.00%\n",
+ "outer fold 3/5 | tuning SVM | inner ACC 89.69% | outer ACC 92.12%\n",
+ "outer fold 3/5 | tuning Softmax | inner ACC 89.03% | outer ACC 90.38%\n",
+ "outer fold 4/5 | tuning DTree | inner ACC 75.03% | outer ACC 73.62%\n",
+ "outer fold 4/5 | tuning KNN | inner ACC 88.88% | outer ACC 90.50%\n",
+ "outer fold 4/5 | tuning SVM | inner ACC 90.78% | outer ACC 90.38%\n",
+ "outer fold 4/5 | tuning Softmax | inner ACC 89.25% | outer ACC 86.50%\n",
+ "outer fold 5/5 | tuning DTree | inner ACC 73.31% | outer ACC 76.25%\n",
+ "outer fold 5/5 | tuning KNN | inner ACC 88.41% | outer ACC 90.88%\n",
+ "outer fold 5/5 | tuning SVM | inner ACC 90.28% | outer ACC 93.00%\n",
+ "outer fold 5/5 | tuning Softmax | inner ACC 88.16% | outer ACC 90.62%\n"
  ]
  }
  ],
  "source": [
  "cv_scores = {name: [] for name, gs_est in gridcvs.items()}\n",
  "\n",
- "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=123)\n",
+ "skfold = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=1)\n",
  "\n",
  "# The outer loop for algorithm selection\n",
  "c = 1\n",
@@ -199,12 +199,12 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "KNN  | outer CV acc. 90.78% +\\- 0.567\n",
- "DTree | outer CV acc. 75.60% +\\- 2.646\n",
- "SVM | outer CV acc. 91.58% +\\- 0.600\n",
- "Softmax | outer CV acc. 89.68% +\\- 0.551\n",
+ "DTree | outer CV acc. 77.22% +\\- 2.584\n",
+ "KNN  | outer CV acc. 91.13% +\\- 0.945\n",
+ "Softmax | outer CV acc. 90.00% +\\- 1.827\n",
+ "SVM | outer CV acc. 91.88% +\\- 0.952\n",
  "\n",
- "SVM Best parameters {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n"
+ "SVM Best parameters {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n"
  ]
  }
  ],
@@ -228,9 +228,9 @@
  "output_type": "stream",
  "text": [
  "Accuracy 90.80% (average over CV test folds)\n",
- "Best Parameters: {'clf4__kernel': 'rbf', 'clf4__gamma': 1.0000000000000001e-05, 'clf4__C': 100.0}\n",
- "Training Accuracy: 96.10%\n",
- "Test Accuracy: 92.70%\n"
+ "Best Parameters: {'clf4__gamma': 0.001, 'clf4__C': 10.0, 'clf4__kernel': 'rbf'}\n",
+ "Training Accuracy: 99.92%\n",
+ "Test Accuracy: 93.00%\n"
  ]
  }
  ],
@@ -263,9 +263,205 @@
  "best_clf = best_algo.best_estimator_\n",
  "final_model = best_clf.fit(X, y)"
  ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Nested CV for algorithm selection in scikit-learn 0.18"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sebastian Raschka 2016-09-30 \n",
+ "\n",
+ "CPython 3.5.2\n",
+ "IPython 5.1.0\n",
+ "\n",
+ "sklearn 0.18\n",
+ "mlxtend 0.4.3dev0\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext watermark\n",
+ "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "There were a lot of neat changes introduced in [scikit-learn 0.18](http://scikit-learn.org/dev/whats_new.html), released on on 28 Sep, 2016, that make nested CV a lot more convenient. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "from mlxtend.data import mnist_data\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Loading and splitting the dataset\n",
+ "# Note that this is a small (stratified) subset\n",
+ "# of MNIST; it consists of 5000 samples only, that is,\n",
+ "# 10% of the original MNIST dataset\n",
+ "# http://yann.lecun.com/exdb/mnist/\n",
+ "X, y = mnist_data()\n",
+ "X = X.astype(np.float32)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
+ " train_size=0.8,\n",
+ " random_state=1,\n",
+ " stratify=y)\n",
+ "\n",
+ "# Initializing Classifiers\n",
+ "clf1 = LogisticRegression(multi_class='multinomial',\n",
+ " solver='newton-cg',\n",
+ " random_state=1)\n",
+ "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n",
+ " leaf_size=50)\n",
+ "clf3 = DecisionTreeClassifier(random_state=1)\n",
+ "clf4 = SVC(random_state=1)\n",
+ "\n",
+ "# Building the pipelines\n",
+ "pipe1 = Pipeline([('std', StandardScaler()),\n",
+ " ('clf1', clf1)])\n",
+ "\n",
+ "pipe2 = Pipeline([('std', StandardScaler()),\n",
+ " ('clf2', clf2)])\n",
+ "\n",
+ "pipe4 = Pipeline([('std', StandardScaler()),\n",
+ " ('clf4', clf4)])\n",
+ "\n",
+ "\n",
+ "# Setting up the parameter grids\n",
+ "param_grid1 = [{'clf1__penalty': ['l2'],\n",
+ " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n",
+ "\n",
+ "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n",
+ " 'clf2__p': [1, 2]}]\n",
+ "\n",
+ "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n",
+ " 'criterion': ['gini', 'entropy']}]\n",
+ "\n",
+ "param_grid4 = [{'clf4__kernel': ['rbf'],\n",
+ " 'clf4__C': np.power(10., np.arange(-4, 4)),\n",
+ " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n",
+ " {'clf4__kernel': ['linear'],\n",
+ " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n",
+ "\n",
+ "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n",
+ "gridcvs = {}\n",
+ "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n",
+ "\n",
+ "for pgrid, est, name in zip((param_grid1, param_grid2,\n",
+ " param_grid3, param_grid4),\n",
+ " (pipe1, pipe2, clf3, pipe4),\n",
+ " ('Softmax', 'KNN', 'DTree', 'SVM')):\n",
+ " gcv = GridSearchCV(estimator=est,\n",
+ " param_grid=pgrid,\n",
+ " scoring='accuracy',\n",
+ " n_jobs=1,\n",
+ " cv=inner_cv,\n",
+ " verbose=0,\n",
+ " refit=True)\n",
+ " gridcvs[name] = gcv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DTree | outer ACC 77.33% +/- 2.72\n",
+ "KNN | outer ACC 91.10% +/- 0.96\n",
+ "SVM | outer ACC 91.95% +/- 1.04\n",
+ "Softmax | outer ACC 90.32% +/- 1.22\n"
+ ]
+ }
+ ],
+ "source": [
+ "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n",
+ "\n",
+ "for name, gs_est in sorted(gridcvs.items()):\n",
+ " nested_score = cross_val_score(gs_est, \n",
+ " X=X_train, \n",
+ " y=y_train, \n",
+ " cv=outer_cv,\n",
+ " n_jobs=1)\n",
+ " print('%s | outer ACC %.2f%% +/- %.2f' % \n",
+ " (name, nested_score.mean() * 100, nested_score.std() * 100))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy 91.03% (average over CV test folds)\n",
+ "Best Parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n",
+ "Training Accuracy: 99.92%\n",
+ "Test Accuracy: 93.00%\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fitting a model to the whole training set\n",
+ "# using the \"best\" algorithm\n",
+ "best_algo = gridcvs['SVM']\n",
+ "\n",
+ "best_algo.fit(X_train, y_train)\n",
+ "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n",
+ "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n",
+ "\n",
+ "print('Accuracy %.2f%% (average over CV test folds)' %\n",
+ " (100 * best_algo.best_score_))\n",
+ "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n",
+ "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n",
+ "print('Test Accuracy: %.2f%%' % (100 * test_acc))"
+ ]
  }
  ],
  "metadata": {
+ "anaconda-cloud": {},
  "kernelspec": {
  "display_name": "Python 3",
  "language": "python",