|
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
| 3 | +from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin |
| 4 | +from sklearn.model_selection import KFold |
| 5 | +import numpy as np |
| 6 | + |
| 7 | + |
class StackingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked-generalization ensemble.

    Each base model is cloned and fitted once per KFold split; its
    out-of-fold predictions form one meta-feature column on which the
    meta model is trained. At prediction time the per-fold clones of
    each base model are aggregated (majority vote for classification,
    mean for regression) into the same meta-feature layout.

    Parameters
    ----------
    base_models : sequence of estimators
        First-level models; each must support ``fit``/``predict`` and
        be clonable via ``sklearn.base.clone``.
    meta_model : estimator
        Second-level model trained on the out-of-fold predictions.
    n_folds : int, default 5
        Number of KFold splits used to build out-of-fold predictions.
    task_type : str, default 'classification'
        Either ``'classification'`` (majority-vote aggregation) or
        anything else, treated as regression (mean aggregation).
    use_features_in_secondary : bool, default False
        If True, the meta model sees the original features stacked
        next to the meta-features.
    """

    def __init__(self, base_models, meta_model, n_folds=5, task_type='classification', use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.task_type = task_type
        self.use_features_in_secondary = use_features_in_secondary

    def fit(self, X, y):
        """Fit all base-model clones out-of-fold, then fit the meta model.

        Returns ``self``. Note: ``X`` is indexed positionally
        (``X[train_index]``), so array-like input is assumed — a pandas
        DataFrame would need ``.iloc`` (TODO confirm with callers).
        """
        # One list of fitted clones per base model (one clone per fold).
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        # Fixed random_state keeps the fold assignment reproducible.
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Column i holds base model i's out-of-fold predictions for every row.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)

        return self

    def _meta_features(self, X):
        """Aggregate the per-fold clones of each base model into one column.

        Classification: per-row majority vote over the fold clones'
        predicted labels. Regression: per-row mean of the fold clones'
        predictions.
        """
        if self.task_type == 'classification':
            # np.bincount requires non-negative integers; predictions come
            # back float once column_stacked with float output, so cast.
            # (Assumes class labels are non-negative ints — TODO confirm.)
            return np.column_stack([
                [np.argmax(np.bincount(fold_preds.astype(int)))
                 for fold_preds in np.column_stack([m.predict(X) for m in models])]
                for models in self.base_models_])
        return np.column_stack([
            np.column_stack([m.predict(X) for m in models]).mean(axis=1)
            for models in self.base_models_])

    def predict(self, X):
        """Predict with the meta model on the aggregated meta-features."""
        meta_features = self._meta_features(X)
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        return self.meta_model_.predict(meta_features)

    def predict_proba(self, X):
        """Class-probability predictions from the meta model.

        Raises
        ------
        AttributeError
            If ``task_type`` is not ``'classification'`` (the original
            code fell through to an undefined variable here).
        """
        if self.task_type != 'classification':
            raise AttributeError(
                "predict_proba is only available when task_type='classification'")
        meta_features = self._meta_features(X)
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        return self.meta_model_.predict_proba(meta_features)