
Commit a57dda5

Authored by Algorithmica
Add files via upload
1 parent 92b1d7a commit a57dda5

File tree: 2 files changed, +208 -0 lines changed

File 1 of 2: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
import sys
sys.path.append("E:/")

import pandas as pd
import os
import common_utils as utils
from sklearn import preprocessing, neighbors, svm, linear_model, ensemble, pipeline, model_selection, feature_selection
import classification_utils as cutils
import seaborn as sns
import tpot
import numpy as np

dir = 'E:/'
titanic_train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(titanic_train.shape)
print(titanic_train.info())

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test['Survived'] = None

#combine train and test so feature engineering and preprocessing are applied consistently to both
titanic = pd.concat([titanic_train, titanic_test], ignore_index=True)
print(titanic.shape)
print(titanic.info())

#extract title feature from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()
titanic['Title'] = titanic['Name'].map(extract_title)
sns.factorplot(x="Title", hue="Survived", data=titanic, kind="count", size=6)

#create family size feature from sibsp, parch
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
sns.FacetGrid(titanic, hue="Survived", size=8).map(sns.kdeplot, "FamilySize").add_legend()

#create family group feature from family-size
def convert_familysize(size):
    if size == 1:
        return 'Single'
    elif size <= 5:
        return 'Medium'
    else:
        return 'Large'
titanic['FamilyGroup'] = titanic['FamilySize'].map(convert_familysize)
sns.factorplot(x="FamilyGroup", hue="Survived", data=titanic, kind="count", size=6)

sns.countplot(x='Cabin', data=titanic)
titanic['Cabin'] = titanic['Cabin'].fillna('U')

titanic = utils.drop_features(titanic, ['PassengerId', 'Name', 'Survived', 'Ticket'])

#type casting
utils.cast_to_cat(titanic, ['Sex', 'Pclass', 'Embarked', 'Title', 'FamilyGroup', 'Cabin'])

cat_features = utils.get_categorical_features(titanic)
print(cat_features)
cont_features = utils.get_continuous_features(titanic)
print(cont_features)

#handle missing data (imputation)
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one-hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

#the first len(train) rows of the combined, preprocessed frame are the training rows
titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

#tune a random forest and keep only the features above mean importance
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='mean')

#xgboost hyperparameter search space for TPOT
#(subsample and colsample ratios capped at 1.0, their valid upper bound in xgboost)
params = {'max_depth': np.arange(1, 200, 1),
          'learning_rate': np.arange(0.0001, 0.1, 0.0001),
          'n_estimators': np.arange(1, 200, 1),
          'nthread': [6],
          'gamma': np.arange(0.00001, 0.1, 0.00001),
          'subsample': np.arange(0.1, 1.01, 0.1),
          'reg_lambda': np.arange(0.1, 200, 1),
          'reg_alpha': np.arange(1, 200, 1),
          'min_child_weight': np.arange(1, 200, 1),
          'colsample_bytree': np.arange(0.1, 1.01, 0.1),
          'colsample_bylevel': np.arange(0.1, 1.01, 0.1)
          }

#restrict TPOT's search to xgboost.XGBClassifier with the grid defined above
tpot_estimator = tpot.TPOTClassifier(generations=5, population_size=100,
                                     offspring_size=250,
                                     verbosity=2, early_stop=3,
                                     config_dict={'xgboost.XGBClassifier': params},
                                     cv=5, scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

#the remaining rows of the combined frame are the test rows; apply the same feature selection
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1, threshold='mean')

titanic_test['Survived'] = tpot_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(dir, 'submission.csv'), columns=['PassengerId', 'Survived'], index=False)
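
Both scripts import two local helper modules, common_utils and classification_utils, from E:/, but neither module is part of this commit. The sketch below is only a guess at what those helpers might look like, reconstructed from how the scripts call them; every implementation detail (the imputation strategies, the SelectFromModel wrapper, the GridSearchCV settings) is an assumption, not the author's code.

#common_utils.py - illustrative sketch only; all details are assumptions inferred from the call sites above
import pandas as pd
from sklearn import impute, feature_selection

def drop_features(df, features):
    #drop the listed columns and return the reduced frame
    return df.drop(columns=features)

def cast_to_cat(df, features):
    #convert the listed columns to pandas categorical dtype in place
    for f in features:
        df[f] = df[f].astype('category')

def get_categorical_features(df):
    #names of the categorical / object columns
    return list(df.select_dtypes(include=['category', 'object']).columns)

def get_continuous_features(df):
    #names of the numeric columns
    return list(df.select_dtypes(include=['number']).columns)

def get_categorical_imputers(df, features):
    #most-frequent-value imputer fitted on the categorical columns
    imputer = impute.SimpleImputer(strategy='most_frequent')
    imputer.fit(df[features])
    return imputer

def get_continuous_imputers(df, features):
    #mean imputer fitted on the numeric columns
    imputer = impute.SimpleImputer(strategy='mean')
    imputer.fit(df[features])
    return imputer

def ohe(df, features):
    #one-hot encode the listed columns, leaving the rest untouched
    return pd.get_dummies(df, columns=features)

def select_features(estimator, X, threshold='mean'):
    #keep only the columns whose importance in the already-fitted estimator clears the threshold
    selector = feature_selection.SelectFromModel(estimator, threshold=threshold, prefit=True)
    return selector.transform(X)

#classification_utils.py - likewise an assumption: presumably a thin wrapper around GridSearchCV
from sklearn import model_selection

def grid_search_best_model(estimator, grid, X, y, cv=10):
    #exhaustive grid search; returns the refitted best estimator
    search = model_selection.GridSearchCV(estimator, grid, cv=cv, scoring='accuracy', n_jobs=-1)
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
    return search.best_estimator_
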
File 2 of 2: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import sys
sys.path.append("E:/")

import pandas as pd
import os
import common_utils as utils
from sklearn import preprocessing, neighbors, svm, linear_model, ensemble, pipeline, model_selection, feature_selection
import classification_utils as cutils
import seaborn as sns
import tpot
import numpy as np

dir = 'E:/'
titanic_train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(titanic_train.shape)
print(titanic_train.info())

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test['Survived'] = None

#combine train and test so feature engineering and preprocessing are applied consistently to both
titanic = pd.concat([titanic_train, titanic_test], ignore_index=True)
print(titanic.shape)
print(titanic.info())

#extract title feature from name
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()
titanic['Title'] = titanic['Name'].map(extract_title)
sns.factorplot(x="Title", hue="Survived", data=titanic, kind="count", size=6)

#create family size feature from sibsp, parch
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
sns.FacetGrid(titanic, hue="Survived", size=8).map(sns.kdeplot, "FamilySize").add_legend()

#create family group feature from family-size
def convert_familysize(size):
    if size == 1:
        return 'Single'
    elif size <= 5:
        return 'Medium'
    else:
        return 'Large'
titanic['FamilyGroup'] = titanic['FamilySize'].map(convert_familysize)
sns.factorplot(x="FamilyGroup", hue="Survived", data=titanic, kind="count", size=6)

sns.countplot(x='Cabin', data=titanic)
titanic['Cabin'] = titanic['Cabin'].fillna('U')

titanic = utils.drop_features(titanic, ['PassengerId', 'Name', 'Survived', 'Ticket'])

#type casting
utils.cast_to_cat(titanic, ['Sex', 'Pclass', 'Embarked', 'Title', 'FamilyGroup', 'Cabin'])

cat_features = utils.get_categorical_features(titanic)
print(cat_features)
cont_features = utils.get_continuous_features(titanic)
print(cont_features)

#handle missing data (imputation)
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

#one-hot encoding
titanic = utils.ohe(titanic, cat_features)

#scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

#the first len(train) rows of the combined, preprocessed frame are the training rows
titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

#tune a random forest and keep only the features above mean importance
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='mean')

#let TPOT search its default configuration space, checkpointing intermediate pipelines
tpot_estimator = tpot.TPOTClassifier(generations=10, population_size=40,
                                     verbosity=2, early_stop=3,
                                     random_state=100,
                                     cv=5, scoring='accuracy',
                                     periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

#the remaining rows of the combined frame are the test rows; apply the same feature selection
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1, threshold='mean')

titanic_test['Survived'] = tpot_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(dir, 'submission.csv'), columns=['PassengerId', 'Survived'], index=False)
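
One small follow-up worth noting (not part of this commit): once TPOT finishes, the winning pipeline can be exported or pickled so the evolutionary search does not have to be re-run at prediction time. A minimal sketch, reusing the tpot_estimator and dir variables from the script above; the output file names are illustrative.

#write the best pipeline found by TPOT as a standalone Python script
tpot_estimator.export(os.path.join(dir, 'tpot_best_pipeline.py'))

#alternatively, pickle the fitted pipeline object directly for later reuse
import joblib
joblib.dump(tpot_estimator.fitted_pipeline_, os.path.join(dir, 'tpot_fitted_pipeline.pkl'))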

0 commit comments
