import sys
sys.path.append("E:/")

import pandas as pd
import os
import common_utils as utils
from sklearn import preprocessing, neighbors, svm, linear_model, ensemble, pipeline, model_selection, feature_selection
import classification_utils as cutils
import seaborn as sns
import tpot
import numpy as np

data_dir = 'E:/'

# read the train and test data
titanic_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(titanic_train.shape)
print(titanic_train.info())

titanic_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test['Survived'] = None

# combine train and test so that features are engineered consistently on both
titanic = pd.concat([titanic_train, titanic_test], ignore_index=True)
print(titanic.shape)
print(titanic.info())

# extract the passenger's title (Mr, Mrs, Master, ...) from the Name column
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

titanic['Title'] = titanic['Name'].map(extract_title)
sns.factorplot(x="Title", hue="Survived", data=titanic, kind="count", size=6)

# create family size feature from SibSp, Parch
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
sns.FacetGrid(titanic, hue="Survived", size=8).map(sns.kdeplot, "FamilySize").add_legend()

# create family group feature from family size
def convert_familysize(size):
    if size == 1:
        return 'Single'
    elif size <= 5:
        return 'Medium'
    else:
        return 'Large'

titanic['FamilyGroup'] = titanic['FamilySize'].map(convert_familysize)
sns.factorplot(x="FamilyGroup", hue="Survived", data=titanic, kind="count", size=6)

# Cabin is mostly missing; mark unknown cabins with 'U'
sns.countplot(x='Cabin', data=titanic)
titanic['Cabin'] = titanic['Cabin'].fillna('U')

# drop columns that are not useful as model features (the target is kept separately)
titanic = utils.drop_features(titanic, ['PassengerId', 'Name', 'Survived', 'Ticket'])

# type casting
utils.cast_to_cat(titanic, ['Sex', 'Pclass', 'Embarked', 'Title', 'FamilyGroup', 'Cabin'])

cat_features = utils.get_categorical_features(titanic)
print(cat_features)
cont_features = utils.get_continuous_features(titanic)
print(cont_features)

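# common_utils is a local helper module that is not shown here. The functions below are a
# minimal sketch of what its feature-listing helpers are assumed to do (select columns by
# dtype); the real implementations may differ. They are for reference only and are not called.
def _sketch_get_categorical_features(df):
    # columns that were cast to the pandas 'category' dtype
    return df.select_dtypes(include=['category']).columns.tolist()

def _sketch_get_continuous_features(df):
    # remaining numeric columns
    return df.select_dtypes(include=[np.number]).columns.tolist()
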
# handle missing data (imputation)
cat_imputers = utils.get_categorical_imputers(titanic, cat_features)
titanic[cat_features] = cat_imputers.transform(titanic[cat_features])
cont_imputers = utils.get_continuous_imputers(titanic, cont_features)
titanic[cont_features] = cont_imputers.transform(titanic[cont_features])

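# Assumed sketch of the imputation helpers: fit a most-frequent imputer for categorical
# columns and a mean imputer for continuous ones, returning the fitted imputer so the same
# statistics are applied to the test rows. Uses sklearn.impute.SimpleImputer (scikit-learn
# >= 0.20); the actual common_utils implementation may differ. Not called in this script.
def _sketch_get_categorical_imputers(df, features):
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='most_frequent')
    imputer.fit(df[features])
    return imputer

def _sketch_get_continuous_imputers(df, features):
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(df[features])
    return imputer
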
# one-hot encoding
titanic = utils.ohe(titanic, cat_features)

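# Assumed sketch of utils.ohe: one-hot encode the categorical columns with pd.get_dummies
# and leave the continuous columns untouched. The real helper may name the dummy columns
# differently. For reference only.
def _sketch_ohe(df, features):
    return pd.get_dummies(df, columns=features)
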
# scale the data
scaler = preprocessing.StandardScaler()
tmp = scaler.fit_transform(titanic)
titanic = pd.DataFrame(tmp, columns=titanic.columns)

# the first len(titanic_train) rows of the combined frame are the training passengers
titanic_train1 = titanic[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

# tune a random forest and use its feature importances to select a reduced feature set
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator, titanic_train1, threshold='mean')

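# Assumed sketches of the model-selection helpers used above: grid_search_best_model is
# taken to be a thin wrapper around GridSearchCV that returns the refit best estimator, and
# select_features a wrapper around SelectFromModel that keeps features whose importance is
# at least the given threshold. The real classification_utils/common_utils code may differ.
def _sketch_grid_search_best_model(estimator, grid, X, y, cv=10):
    grid_estimator = model_selection.GridSearchCV(estimator, grid, cv=cv, scoring='accuracy')
    grid_estimator.fit(X, y)
    print(grid_estimator.best_params_, grid_estimator.best_score_)
    return grid_estimator.best_estimator_

def _sketch_select_features(estimator, X, threshold='mean'):
    selector = feature_selection.SelectFromModel(estimator, threshold=threshold, prefit=True)
    return selector.transform(X)
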
# XGBoost hyperparameter search space for TPOT
# (subsample and colsample_* must lie in (0, 1], so those ranges are kept below 1.0)
params = {'max_depth': np.arange(1, 200, 1),
          'learning_rate': np.arange(0.0001, 0.1, 0.0001),
          'n_estimators': np.arange(1, 200, 1),
          'nthread': [6],
          'gamma': np.arange(0.00001, 0.1, 0.00001),
          'subsample': np.arange(0.1, 1.0, 0.1),
          'reg_lambda': np.arange(0.1, 200, 1),
          'reg_alpha': np.arange(1, 200, 1),
          'min_child_weight': np.arange(1, 200, 1),
          'colsample_bytree': np.arange(0.1, 1.0, 0.1),
          'colsample_bylevel': np.arange(0.1, 1.0, 0.1)
          }

# let TPOT evolve pipelines, restricting the search to XGBClassifier with the grid above
tpot_estimator = tpot.TPOTClassifier(generations=5, population_size=100,
                                     offspring_size=250,
                                     verbosity=2, early_stop=3,
                                     config_dict={'xgboost.XGBClassifier': params},
                                     cv=5, scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

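# Optional (not in the original script): TPOT can also write the best pipeline it found to a
# standalone Python file via its export() method, e.g.:
# tpot_estimator.export(os.path.join(data_dir, 'tpot_best_pipeline.py'))
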
# apply the same feature selection to the test portion and generate the submission file
titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator, titanic_test1, threshold='mean')

titanic_test['Survived'] = tpot_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(data_dir, 'submission.csv'), columns=['PassengerId', 'Survived'], index=False)