Skip to content

Commit c2e53a0

Browse files
author
Algorithmica
authored
Add files via upload
1 parent b3c28be commit c2e53a0

7 files changed

+845
-0
lines changed
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import sys
2+
path = 'E://utils'
3+
sys.path.append(path)
4+
5+
import common_utils as utils
6+
import regression_utils as rutils
7+
from sklearn import metrics, model_selection, neighbors, linear_model, decomposition, manifold
8+
import math
9+
import pandas as pd
10+
import os
11+
import seaborn as sns
12+
import numpy as np
13+
14+
def log_rmse(y_orig, y_pred):
    """Root mean squared logarithmic error between actuals and predictions."""
    msle = metrics.mean_squared_log_error(y_orig, y_pred)
    return math.sqrt(msle)
16+
17+
def rmse(y_orig, y_pred):
    """Root mean squared error between actuals and predictions."""
    mse = metrics.mean_squared_error(y_orig, y_pred)
    return math.sqrt(mse)
19+
20+
21+
# House-price regression: impute -> one-hot encode -> scale -> lasso feature
# selection -> PCA -> kNN tuned by grid search on a (negated) log-RMSE scorer.
path = 'E://'
house_train = pd.read_csv(os.path.join(path, "house_train.csv"))
house_train.shape
house_train.info()

house_test = pd.read_csv(os.path.join(path, "house_test.csv"))
house_test.shape
house_test.info()

# Stack train and test so imputation/encoding/scaling share one schema.
house = pd.concat((house_train, house_test), axis=0)
house.shape
house.info()

print(utils.get_continuous_features(house))
print(utils.get_non_continuous_features(house))

# Exploratory plots: sale price vs. build year and sale year.
sns.countplot(x='YearBuilt', data=house_train)
sns.jointplot(x="SalePrice", y="YearBuilt", data=house_train)
sns.FacetGrid(house_train, hue="YearBuilt", size=8).map(sns.kdeplot, "SalePrice").add_legend()

sns.countplot(x='YrSold', data=house_train)
sns.jointplot(x="SalePrice", y="YrSold", data=house_train)
sns.FacetGrid(house_train, hue="YrSold", size=8).map(sns.kdeplot, "SalePrice").add_legend()

# MSSubClass is a numeric code for a categorical concept; treat it as category.
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house, features_to_cast)

# Drop the row id and any feature missing in more than 25% of rows.
features_to_drop = ['Id']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house, 0.25)
features_to_drop.extend(missing_features_above_th)
house1 = utils.drop_features(house, features_to_drop)
house1.info()

# Impute categorical features.
imputable_cat_features = utils.get_non_continuous_features(house1)
cat_imputer = utils.get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(house1[imputable_cat_features])

# Impute continuous features.
imputable_cont_features = utils.get_continuous_features(house1)
cont_imputer = utils.get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(house1[imputable_cont_features])
house1.info()

# One-hot encode categoricals, then scale all columns.
house2 = utils.ohe(house1, imputable_cat_features)

scaler = utils.get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

# The first len(house_train) rows of the stacked frame are the training rows.
X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

# Lasso zeroes out weak features; keep the non-zero-coefficient ones.
lasso_selector = linear_model.Lasso()
lasso_selector.fit(X_train, y_train)
print(lasso_selector.coef_)
utils.plot_feature_importances(lasso_selector, X_train, 40)

X_train1 = utils.select_features(lasso_selector, X_train)

utils.corr_heatmap(X_train1)
# PCA with as many components as selected features: a pure rotation into
# uncorrelated axes (no dimensionality reduction here).
lpca = decomposition.PCA(X_train1.shape[1])
lpca.fit(X_train1)
print(np.cumsum(lpca.explained_variance_ratio_))
pca_data = lpca.transform(X_train1)

# 2-D t-SNE embedding for a visual sanity check of the target surface.
tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

# Negate log-RMSE so the grid search can maximize it.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(5, 15))}
final_model = utils.grid_search_best_model(knn_estimator, knn_grid, pca_data, y_train, scoring=scoring)

# Push the held-out rows through the same selection + PCA steps.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

# BUG FIX: the model was trained on PCA-transformed data (pca_data), so it
# must predict on pca_test_data — the original predicted on the un-rotated
# X_test1, feeding features in the wrong space to the kNN model.
house_test['SalePrice'] = final_model.predict(pca_test_data)
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv", columns=["Id", "SalePrice"], index=False)
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#knn
2+
import sys
3+
path = 'E://utils'
4+
sys.path.append(path)
5+
6+
import common_utils as utils
7+
import regression_utils as rutils
8+
from sklearn import metrics, model_selection, neighbors, linear_model, decomposition, manifold
9+
import math
10+
import pandas as pd
11+
import os
12+
import seaborn as sns
13+
import numpy as np
14+
15+
def log_rmse(y_orig, y_pred):
    """Root mean squared logarithmic error between actuals and predictions."""
    msle = metrics.mean_squared_log_error(y_orig, y_pred)
    return math.sqrt(msle)
17+
18+
def rmse(y_orig, y_pred):
    """Root mean squared error between actuals and predictions."""
    mse = metrics.mean_squared_error(y_orig, y_pred)
    return math.sqrt(mse)
20+
21+
22+
# kNN house-price pipeline, variant: full PCA first, then lasso-based feature
# selection on the principal components.
path = 'E://'

# Load train/test and inspect the schemas.
train_df = pd.read_csv(os.path.join(path, "house_train.csv"))
train_df.shape
train_df.info()

test_df = pd.read_csv(os.path.join(path, "house_test.csv"))
test_df.shape
test_df.info()

# Stack the two frames so preprocessing sees one consistent schema.
full_df = pd.concat((train_df, test_df), axis=0)
full_df.shape
full_df.info()

print(utils.get_continuous_features(full_df))
print(utils.get_non_continuous_features(full_df))

# Exploratory plots: sale price against build year and sale year.
sns.countplot(x='YearBuilt', data=train_df)
sns.jointplot(x="SalePrice", y="YearBuilt", data=train_df)
sns.FacetGrid(train_df, hue="YearBuilt", size=8).map(sns.kdeplot, "SalePrice").add_legend()

sns.countplot(x='YrSold', data=train_df)
sns.jointplot(x="SalePrice", y="YrSold", data=train_df)
sns.FacetGrid(train_df, hue="YrSold", size=8).map(sns.kdeplot, "SalePrice").add_legend()

# MSSubClass is a numeric code for a categorical concept.
utils.cast_to_cat(full_df, ['MSSubClass'])

# Drop the row id plus any feature missing in more than 25% of rows.
drop_list = ['Id']
drop_list.extend(utils.get_features_to_drop_on_missingdata(full_df, 0.25))
filled_df = utils.drop_features(full_df, drop_list)
filled_df.info()

# Impute categorical features, then continuous ones.
cat_features = utils.get_non_continuous_features(filled_df)
cat_imputer = utils.get_categorical_imputers(filled_df, cat_features)
filled_df[cat_features] = cat_imputer.transform(filled_df[cat_features])

cont_features = utils.get_continuous_features(filled_df)
cont_imputer = utils.get_continuous_imputers(filled_df, cont_features)
filled_df[cont_features] = cont_imputer.transform(filled_df[cont_features])
filled_df.info()

# One-hot encode categoricals, then scale every column.
encoded_df = utils.ohe(filled_df, cat_features)

scaler = utils.get_scaler(encoded_df)
scaled_df = pd.DataFrame(scaler.transform(encoded_df), columns=encoded_df.columns)

# The first len(train_df) rows of the stacked frame are the training rows.
train_X = scaled_df[:train_df.shape[0]]
sale_price = train_df['SalePrice']

utils.corr_heatmap(train_X)

# Full PCA (all components kept) rotates the data into uncorrelated axes.
pca_model = decomposition.PCA()
pca_model.fit(train_X)
print(np.cumsum(pca_model.explained_variance_ratio_))
train_pca = pca_model.transform(train_X)
print(train_pca.shape)
train_pca = pd.DataFrame(train_pca)

# Lasso zeroes out low-value components; keep the survivors.
selector = linear_model.Lasso()
selector.fit(train_pca, sale_price)
print(selector.coef_)
utils.plot_feature_importances(selector, train_pca, 40)

selected_train = utils.select_features(selector, train_pca)
print(selected_train.shape)

# 2-D t-SNE embedding for a visual sanity check of the target surface.
tsne_model = manifold.TSNE(n_components=2)
embedded = tsne_model.fit_transform(selected_train)
rutils.plot_data_3d_regression(embedded, sale_price)

# Negate log-RMSE so the grid search can maximize it.
neg_log_rmse_scorer = metrics.make_scorer(log_rmse, greater_is_better=False)

knn = neighbors.KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(5, 15))}
best_model = utils.grid_search_best_model(knn, param_grid, selected_train, sale_price, scoring=neg_log_rmse_scorer)

# Push the held-out rows through the same PCA + selection steps.
test_X = scaled_df[train_df.shape[0]:]
test_pca = pca_model.transform(test_X)
selected_test = utils.select_features(selector, pd.DataFrame(test_pca))
print(selected_test.shape)

test_df['SalePrice'] = best_model.predict(selected_test)
test_df.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv", columns=["Id", "SalePrice"], index=False)
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#with log transformation of target and knn
2+
3+
import sys
4+
path = 'E://utils'
5+
sys.path.append(path)
6+
7+
import common_utils as utils
8+
import regression_utils as rutils
9+
from sklearn import metrics, model_selection, neighbors, linear_model, decomposition, manifold
10+
import math
11+
import pandas as pd
12+
import os
13+
import seaborn as sns
14+
import numpy as np
15+
16+
def log_rmse(y_orig, y_pred):
    """Root mean squared logarithmic error between actuals and predictions."""
    msle = metrics.mean_squared_log_error(y_orig, y_pred)
    return math.sqrt(msle)
18+
19+
def rmse(y_orig, y_pred):
    """Root mean squared error between actuals and predictions."""
    mse = metrics.mean_squared_error(y_orig, y_pred)
    return math.sqrt(mse)
21+
22+
23+
# kNN house-price pipeline, variant: lasso selection, 95%-variance PCA, and a
# log1p-transformed target (predictions are inverted with expm1 at the end).
path = 'E://'

# Load train/test and inspect the schemas.
train_df = pd.read_csv(os.path.join(path, "house_train.csv"))
train_df.shape
train_df.info()

test_df = pd.read_csv(os.path.join(path, "house_test.csv"))
test_df.shape
test_df.info()

# Stack the two frames so preprocessing sees one consistent schema.
full_df = pd.concat((train_df, test_df), axis=0)
full_df.shape
full_df.info()

print(utils.get_continuous_features(full_df))
print(utils.get_non_continuous_features(full_df))

# Exploratory plots: sale price against build year and sale year.
sns.countplot(x='YearBuilt', data=train_df)
sns.jointplot(x="SalePrice", y="YearBuilt", data=train_df)
sns.FacetGrid(train_df, hue="YearBuilt", size=8).map(sns.kdeplot, "SalePrice").add_legend()

sns.countplot(x='YrSold', data=train_df)
sns.jointplot(x="SalePrice", y="YrSold", data=train_df)
sns.FacetGrid(train_df, hue="YrSold", size=8).map(sns.kdeplot, "SalePrice").add_legend()

# MSSubClass is a numeric code for a categorical concept.
utils.cast_to_cat(full_df, ['MSSubClass'])

# Drop the row id plus any feature missing in more than 25% of rows.
drop_list = ['Id']
drop_list.extend(utils.get_features_to_drop_on_missingdata(full_df, 0.25))
filled_df = utils.drop_features(full_df, drop_list)
filled_df.info()

# Impute categorical features, then continuous ones.
cat_features = utils.get_non_continuous_features(filled_df)
cat_imputer = utils.get_categorical_imputers(filled_df, cat_features)
filled_df[cat_features] = cat_imputer.transform(filled_df[cat_features])

cont_features = utils.get_continuous_features(filled_df)
cont_imputer = utils.get_continuous_imputers(filled_df, cont_features)
filled_df[cont_features] = cont_imputer.transform(filled_df[cont_features])
filled_df.info()

# One-hot encode categoricals, then scale every column.
encoded_df = utils.ohe(filled_df, cat_features)

scaler = utils.get_scaler(encoded_df)
scaled_df = pd.DataFrame(scaler.transform(encoded_df), columns=encoded_df.columns)

# The first len(train_df) rows of the stacked frame are the training rows.
train_X = scaled_df[:train_df.shape[0]]
sale_price = train_df['SalePrice']

# Lasso on the scaled features; keep only non-zero-coefficient columns.
selector = linear_model.Lasso()
selector.fit(train_X, sale_price)
print(selector.coef_)
utils.plot_feature_importances(selector, train_X, 40)

selected_train = utils.select_features(selector, train_X)

utils.corr_heatmap(selected_train)
# Keep just enough principal components to explain 95% of the variance.
pca_model = decomposition.PCA(0.95)
pca_model.fit(selected_train)
print(np.cumsum(pca_model.explained_variance_ratio_))
train_pca = pca_model.transform(selected_train)
print(train_pca.shape)

# 2-D t-SNE embedding for a visual sanity check of the target surface.
tsne_model = manifold.TSNE(n_components=2)
embedded = tsne_model.fit_transform(train_pca)
rutils.plot_data_3d_regression(embedded, sale_price)

# NOTE(review): the grid-search target below is log1p(SalePrice), so scoring
# with log_rmse applies a second log on top of it — confirm the double
# transform is intended rather than plain rmse on the logged target.
neg_log_rmse_scorer = metrics.make_scorer(log_rmse, greater_is_better=False)

# The raw target is right-skewed; log1p brings it closer to normal.
sns.distplot(sale_price)
log_price = np.log1p(sale_price)
sns.distplot(log_price)

knn = neighbors.KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(5, 15))}
best_model = utils.grid_search_best_model(knn, param_grid, train_pca, log_price, scoring=neg_log_rmse_scorer)

# Same selection + PCA for the held-out rows.
test_X = scaled_df[train_df.shape[0]:]
selected_test = utils.select_features(selector, test_X)
test_pca = pca_model.transform(selected_test)
test_pca.shape

# Invert the log1p transform to put predictions back on the price scale.
test_df['SalePrice'] = np.expm1(best_model.predict(test_pca))
test_df.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv", columns=["Id", "SalePrice"], index=False)

0 commit comments

Comments
 (0)