Commit 1d4db27
Added ensemble method example code
1 parent 0896968 commit 1d4db27

7 files changed: +283 −0 lines changed

Ensemble_Methods/code/averaging.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
import numpy as np


class AveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)
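
A minimal usage sketch (illustrative, not part of the commit), assuming AveragedModels is importable and using synthetic data; the two base regressors are arbitrary choices:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)
avg = AveragedModels(models=[Ridge(alpha=1.0), Lasso(alpha=0.01)])
avg.fit(X, y)
y_pred = avg.predict(X)  # element-wise mean of the two base models' predictions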

Ensemble_Methods/code/bagging.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
import numpy as np


class BaggingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, task_type='classification'):
        self.models = models
        self.task_type = task_type

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Fit each cloned model on its own bootstrap sample
        for model in self.models_:
            X_tmp, y_tmp = self.subsample(X, y)
            model.fit(X_tmp, y_tmp)

        return self

    # Create a random subsample from the dataset with replacement
    @staticmethod
    def subsample(X, y, ratio=1.0):
        X_new, y_new = list(), list()
        n_sample = round(len(X) * ratio)
        while len(X_new) < n_sample:
            index = np.random.randint(len(X))
            X_new.append(X[index])
            y_new.append(y[index])
        return X_new, y_new

    def predict(self, X):
        predictions_array = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        if self.task_type == 'classification':
            # Majority vote per sample (assumes non-negative integer labels)
            return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array])
        else:
            return np.mean(predictions_array, axis=1)

    def predict_proba(self, X):
        if self.task_type == 'classification':
            predictions = []
            for x in X:
                # Stack each model's probability row, then average across models
                prediction = np.vstack([
                    model.predict_proba([x]) for model in self.models_
                ])
                predictions.append(np.mean(prediction, axis=0))
            return np.array(predictions)
        return None
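
A usage sketch (illustrative only), assuming integer class labels and numpy-array inputs, since subsample indexes X and y positionally:

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
bag = BaggingModels(models=[DecisionTreeClassifier(random_state=i) for i in range(5)])
bag.fit(X, y)                 # each tree sees its own bootstrap sample
labels = bag.predict(X)       # per-sample majority vote across the trees
probs = bag.predict_proba(X)  # per-sample mean of the trees' class probabilities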

Ensemble_Methods/code/blending.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
from sklearn.model_selection import train_test_split
import numpy as np


class BlendingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, holdout_pct=0.2, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.holdout_pct = holdout_pct
        self.use_features_in_secondary = use_features_in_secondary

    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [clone(x) for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)

        X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=self.holdout_pct)

        holdout_predictions = np.zeros((X_holdout.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models_):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_holdout)
            holdout_predictions[:, i] = y_pred
        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X_holdout, holdout_predictions)), y_holdout)
        else:
            self.meta_model_.fit(holdout_predictions, y_holdout)

        return self

    def predict(self, X):
        meta_features = np.column_stack([
            model.predict(X) for model in self.base_models_
        ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)

    def predict_proba(self, X):
        meta_features = np.column_stack([
            model.predict(X) for model in self.base_models_
        ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict_proba(meta_features)
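
A regression usage sketch (assumed, not from the commit); the model choices and the 80/20 holdout split are arbitrary:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, LinearRegression

X, y = make_regression(n_samples=300, n_features=10, noise=0.5, random_state=0)
blend = BlendingModels(base_models=[Ridge(alpha=1.0), Lasso(alpha=0.01)],
                       meta_model=LinearRegression(), holdout_pct=0.2)
blend.fit(X, y)            # base models fit on 80%, meta model on the 20% holdout
y_pred = blend.predict(X)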
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
import numpy as np


class MajorityVote(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        predictions_array = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        # Most frequent label per row (assumes non-negative integer labels)
        return np.array([np.argmax(np.bincount(predictions)) for predictions in predictions_array])
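
A usage sketch (illustrative); the three classifiers are arbitrary choices, and labels must be non-negative integers for np.bincount:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
vote = MajorityVote(models=[LogisticRegression(max_iter=1000),
                            KNeighborsClassifier(),
                            DecisionTreeClassifier(random_state=0)])
vote.fit(X, y)
labels = vote.predict(X)  # most frequent label across the three classifiers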

Ensemble_Methods/code/stacking.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
from sklearn.model_selection import KFold
import numpy as np


class StackingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5, task_type='classification', use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.task_type = task_type
        self.use_features_in_secondary = use_features_in_secondary

    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Train cloned base models and create out-of-fold predictions
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)

        return self

    def predict(self, X):
        if self.task_type == 'classification':
            # Majority vote over each base model's fold instances
            meta_features = np.column_stack([
                [np.argmax(np.bincount(predictions)) for predictions in
                 np.column_stack([model.predict(X) for model in base_models])]
                for base_models in self.base_models_])
        else:
            # Average over each base model's fold instances
            meta_features = np.column_stack([
                np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
                for base_models in self.base_models_])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)

    def predict_proba(self, X):
        if self.task_type != 'classification':
            # Class probabilities are only defined for classification
            return None
        meta_features = np.column_stack([
            [np.argmax(np.bincount(predictions)) for predictions in
             np.column_stack([model.predict(X) for model in base_models])]
            for base_models in self.base_models_])
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict_proba(meta_features)
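
A classification usage sketch under assumed model choices; note the meta model here is trained on out-of-fold label predictions, not probabilities:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
stack = StackingModels(base_models=[KNeighborsClassifier(), DecisionTreeClassifier(random_state=0)],
                       meta_model=LogisticRegression(max_iter=1000),
                       n_folds=5, task_type='classification')
stack.fit(X, y)
labels = stack.predict(X)  # meta model applied to majority-voted fold predictions
probs = stack.predict_proba(X)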
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
from sklearn.model_selection import KFold
import numpy as np


class StackingModelsRetrained(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary

    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [clone(x) for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Train cloned base models and create out-of-fold predictions
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)

        # Retrain the base models on the full dataset
        for model in self.base_models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        meta_features = np.column_stack([
            base_model.predict(X) for base_model in self.base_models_])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)

    def predict_proba(self, X):
        meta_features = np.column_stack([
            base_model.predict(X) for base_model in self.base_models_])
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict_proba(meta_features)
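
A regression usage sketch (assumed names and models); unlike StackingModels, the base models are refit on the full dataset after the out-of-fold pass:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, LinearRegression

X, y = make_regression(n_samples=300, n_features=10, noise=0.5, random_state=0)
stack = StackingModelsRetrained(base_models=[Ridge(alpha=1.0), Lasso(alpha=0.01)],
                                meta_model=LinearRegression(), n_folds=5)
stack.fit(X, y)
y_pred = stack.predict(X)  # meta model applied to the retrained base models' predictions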
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin
import numpy as np


class WeightedAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, weights):
        self.models = models
        self.weights = weights
        # Use a tolerance: an exact == 1 check can fail for float weights
        assert np.isclose(sum(self.weights), 1.0)

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.sum(predictions * self.weights, axis=1)
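
A usage sketch (illustrative); the 0.7/0.3 weighting is an arbitrary assumption, and the weights must sum to 1:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)
wavg = WeightedAveragedModels(models=[Ridge(alpha=1.0), Lasso(alpha=0.01)], weights=[0.7, 0.3])
wavg.fit(X, y)
y_pred = wavg.predict(X)  # 0.7 * Ridge prediction + 0.3 * Lasso prediction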
