|
22 | 22 |
|
23 | 23 | import numpy as np |
24 | 24 | import matplotlib.pyplot as plt |
| 25 | +import pandas as pd |
25 | 26 |
|
26 | | -from sklearn import linear_model, decomposition, datasets |
| 27 | +from sklearn import datasets |
| 28 | +from sklearn.decomposition import PCA |
| 29 | +from sklearn.linear_model import SGDClassifier |
27 | 30 | from sklearn.pipeline import Pipeline |
28 | 31 | from sklearn.model_selection import GridSearchCV |
29 | 32 |
|
30 | | -logistic = linear_model.LogisticRegression() |
31 | 33 |
|
32 | | -pca = decomposition.PCA() |
| 34 | +# Define a pipeline to search for the best combination of PCA truncation |
| 35 | +# and classifier regularization. |
| 36 | +logistic = SGDClassifier(loss='log_loss', penalty='l2', early_stopping=True,
| 37 | +                          max_iter=10000, tol=1e-5, random_state=0)
| 38 | +pca = PCA() |
33 | 39 | pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) |
34 | 40 |
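Note: the same two-step chain can also be built with sklearn's make_pipeline, which derives step names from the lowercased class names ('pca', 'sgdclassifier'). A minimal sketch, not part of this change, reusing the PCA and SGDClassifier imports above:

    from sklearn.pipeline import make_pipeline

    # Step names are generated automatically from the class names
    pipe2 = make_pipeline(PCA(), SGDClassifier(max_iter=10000, tol=1e-5,
                                               random_state=0))
    print(list(pipe2.named_steps))  # ['pca', 'sgdclassifier']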
|
35 | 41 | digits = datasets.load_digits() |
36 | 42 | X_digits = digits.data |
37 | 43 | y_digits = digits.target |
38 | 44 |
|
| 45 | +# Parameters of pipelines can be set using '__'-separated parameter names:
| 46 | +param_grid = { |
| 47 | + 'pca__n_components': [5, 20, 30, 40, 50, 64], |
| 48 | + 'logistic__alpha': np.logspace(-4, 4, 5), |
| 49 | +} |
| 50 | +search = GridSearchCV(pipe, param_grid, cv=5,
| 51 | +                      return_train_score=False)
| 52 | +search.fit(X_digits, y_digits) |
| 53 | +print("Best parameter (CV score=%0.3f):" % search.best_score_) |
| 54 | +print(search.best_params_) |
| 55 | + |
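As a quick illustration of the '__' convention used in param_grid above (an editor's sketch, reusing the pipe object from this diff): every step parameter is addressable on the pipeline as <step>__<parameter>.

    # List all tunable names, e.g. 'pca__n_components' and 'logistic__alpha'
    print(sorted(pipe.get_params().keys()))

    # The same names work for setting values directly on the pipeline
    pipe.set_params(pca__n_components=30, logistic__alpha=1e-3)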
39 | 56 | # Plot the PCA spectrum |
40 | 57 | pca.fit(X_digits) |
41 | 58 |
|
42 | | -plt.figure(1, figsize=(4, 3)) |
43 | | -plt.clf() |
44 | | -plt.axes([.2, .2, .7, .7]) |
45 | | -plt.plot(pca.explained_variance_, linewidth=2) |
46 | | -plt.axis('tight') |
47 | | -plt.xlabel('n_components') |
48 | | -plt.ylabel('explained_variance_') |
| 59 | +fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6)) |
| 60 | +ax0.plot(pca.explained_variance_ratio_, linewidth=2) |
| 61 | +ax0.set_ylabel('PCA explained variance') |
| 62 | + |
| 63 | +ax0.axvline(search.best_estimator_.named_steps['pca'].n_components, |
| 64 | + linestyle=':', label='n_components chosen') |
| 65 | +ax0.legend(prop=dict(size=12)) |
49 | 66 |
|
50 | | -# Prediction |
51 | | -n_components = [20, 40, 64] |
52 | | -Cs = np.logspace(-4, 4, 3) |
| 67 | +# For each number of components, find the best classifier results |
| 68 | +results = pd.DataFrame(search.cv_results_) |
| 69 | +components_col = 'param_pca__n_components' |
| 70 | +best_clfs = results.groupby(components_col).apply( |
| 71 | + lambda g: g.nlargest(1, 'mean_test_score')) |
53 | 72 |
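The groupby(...).apply(...) idiom above keeps the single best-scoring row per component count. An equivalent sketch (assuming the same results frame) that avoids apply: sort once by score, then take the first row of each group.

    # Equivalent: best mean_test_score per n_components value
    best_alt = (results.sort_values('mean_test_score', ascending=False)
                       .groupby(components_col).head(1)
                       .sort_values(components_col))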
|
54 | | -# Parameters of pipelines can be set using ‘__’ separated parameter names: |
55 | | -estimator = GridSearchCV(pipe, |
56 | | - dict(pca__n_components=n_components, |
57 | | - logistic__C=Cs), cv=5) |
58 | | -estimator.fit(X_digits, y_digits) |
| 73 | +best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score', |
| 74 | + legend=False, ax=ax1) |
| 75 | +ax1.set_ylabel('Classification accuracy (val)') |
| 76 | +ax1.set_xlabel('n_components') |
59 | 77 |
|
60 | | -plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components, |
61 | | - linestyle=':', label='n_components chosen') |
62 | | -plt.legend(prop=dict(size=12)) |
| 78 | +plt.tight_layout() |
63 | 79 | plt.show() |
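Usage note: because GridSearchCV refits the best pipeline on the full dataset by default (refit=True), the fitted search object can be used for prediction directly. A small sketch, comparing against the known labels only for illustration:

    # Predict with the refit best pipeline (PCA + SGD classifier)
    print(search.predict(X_digits[:5]))  # predicted labels
    print(y_digits[:5])                  # true labels for comparison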