Skip to content

Commit 809b7a7

Browse files
authored
Add files via upload
1 parent d1b9a2d commit 809b7a7

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

Feature Selection.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
# coding: utf-8
"""Feature-selection exercise (ud120): deliberately overfit a decision tree
on a tiny training set, then inspect how unreasonably high the test accuracy
is — motivation for finding and removing overly-powerful features."""

# In[31]:

import pickle
import numpy
# Fix the RNG seed so the train/test split and tree are reproducible.
numpy.random.seed(42)

# In[32]:

# Pre-processed email data: word_data holds one string of words per email,
# author_data the matching author labels.
word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_modified_unix.pkl"
author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_modified_unix.pkl"
# FIX: use context managers so file handles are closed deterministically
# (original used pickle.load(open(...)), which leaks the handle).
with open(word_file, "rb") as f:
    word_data = pickle.load(f)
with open(author_file, "rb") as f:
    author_data = pickle.load(f)

# In[33]:

import sklearn
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, author_data, test_size=0.1, random_state=42)

# In[34]:

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features; max_df=0.5 drops words appearing in >50% of documents.
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
features_train = vect.fit_transform(features_train)
features_test = vect.transform(features_test).toarray()

# In[37]:

#### training only on 150 data points to put myself into overfit regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

# In[38]:

print('number of training points: ', len(features_train))

# In[45]:

"""overfitting the Decision Tree and checking the accuracy"""
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
y_pred = clf.predict(features_test)

print(confusion_matrix(labels_test, y_pred))
print(classification_report(labels_test, y_pred))
print(accuracy_score(labels_test, y_pred))

# Yes, it has an accuracy much higher than it should be.
# Hence, finding the most important features.

0 commit comments

Comments
 (0)