Skip to content

Commit 14848f4

Browse files
committed
added spam classifier
1 parent 5a53324 commit 14848f4

File tree

12 files changed

+5803
-0
lines changed

12 files changed

+5803
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# [How to Build a Spam Classifier using Keras in Python](https://www.thepythoncode.com/article/build-spam-classifier-keras-python)
2+
To run this:
3+
- `pip3 install -r requirements.txt`
4+
- For training, since we're using transfer learning, you first need to download and extract [GloVe](http://nlp.stanford.edu/data/glove.6B.zip) and put it in the `data` folder. These are pre-trained embedding vectors that map each word to a vector; words that have similar meanings tend to have vectors that are very close to each other.
5+
```
6+
python3 spam_classifier.py
7+
```
8+
This will write TensorFlow logs to the `logs` folder and save the model and tokenizer in `results`, so that `test.py` can use them.
9+
- After training has finished, try testing your own emails, or adapt the code to your needs:
10+
```
11+
python3 test.py
12+
```

machine-learning/nlp/spam-classifier/data/SMSSpamCollection

Lines changed: 5574 additions & 0 deletions
Large diffs are not rendered by default.
1.31 MB
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
sklearn
2+
keras
3+
tqdm
4+
numpy
5+
keras_metrics
4.36 MB
Binary file not shown.
4.36 MB
Binary file not shown.
4.36 MB
Binary file not shown.
4.36 MB
Binary file not shown.
404 KB
Binary file not shown.
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# to use CPU uncomment below code
2+
# import os
3+
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
4+
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
5+
6+
# import tensorflow as tf
7+
8+
# config = tf.ConfigProto(intra_op_parallelism_threads=5,
9+
# inter_op_parallelism_threads=5,
10+
# allow_soft_placement=True,
11+
# device_count = {'CPU' : 1,
12+
# 'GPU' : 0}
13+
# )
14+
15+
16+
from keras.preprocessing.text import Tokenizer
17+
from keras.preprocessing.sequence import pad_sequences
18+
from keras.utils import to_categorical
19+
from keras.callbacks import ModelCheckpoint, TensorBoard
20+
from sklearn.model_selection import train_test_split
21+
import time
22+
import numpy as np
23+
import pickle
24+
25+
from utils import get_embedding_vectors, get_model, SEQUENCE_LENGTH, EMBEDDING_SIZE, TEST_SIZE
26+
from utils import BATCH_SIZE, EPOCHS, int2label, label2int
27+
28+
29+
def load_data():
    """
    Loads SMS Spam Collection dataset

    Reads "data/SMSSpamCollection" line by line. The first
    whitespace-separated token of every line is taken as the label and the
    remaining tokens, re-joined with single spaces, as the message text.
    Blank lines are skipped.

    Returns:
        A tuple `(texts, labels)` of two lists of strings of equal length,
        where `labels[i]` is the label of `texts[i]`.
    """
    texts, labels = [], []
    # explicit encoding, so reading the file does not depend on the platform default
    with open("data/SMSSpamCollection", encoding="utf-8") as f:
        for line in f:
            split = line.split()
            if not split:
                # blank (or whitespace-only) line: there is no label to read
                continue
            # tokens produced by split() carry no surrounding whitespace
            labels.append(split[0])
            texts.append(' '.join(split[1:]))
    return texts, labels
40+
41+
42+
# load the data
X, y = load_data()

# Text tokenization
# vectorizing text, turning each text into sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# lets dump it to a file, so we can use it in testing
# (the context manager makes sure the file is flushed and closed after the dump)
with open("results/tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
51+
52+
# convert to sequence of integers
X = tokenizer.texts_to_sequences(X)
print(X[0])
# NOTE: X is deliberately left as a list of lists here. The sequences have
# different lengths, and calling np.array() on such ragged input raises
# ValueError on NumPy >= 1.24. pad_sequences takes the list as it is.
# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
print(X[0])
# One Hot encoding labels
# (example assumes label2int maps ham to 0 and spam to 1)
# [spam, ham, spam, ham, ham] will be converted to:
# [1, 0, 1, 0, 0] and then to:
# [[0, 1], [1, 0], [0, 1], [1, 0], [1, 0]]

y = [label2int[label] for label in y]
y = to_categorical(y)

print(y[0])
74+
75+
# shuffle the samples and split them into a training and a testing set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=7,
)

# build the model, asking for 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)

# callbacks passed to the training call below:
# ModelCheckpoint writes under results/, with the validation loss in the
# file name (save_best_only=True)
model_checkpoint = ModelCheckpoint(
    "results/spam_classifier_{val_loss:.2f}",
    save_best_only=True,
    verbose=1,
)
# TensorBoard logs go to a directory named after the current timestamp
tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")

# show the shapes of the training and testing arrays
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)

# fit on the training set, using the testing set as validation data
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[tensorboard, model_checkpoint],
    verbose=1,
)
97+
98+
# evaluate the trained model on the testing set
result = model.evaluate(X_test, y_test)
# the first four entries are read as loss, accuracy, precision and recall
# (the order presumably follows the metrics configured in get_model - confirm there)
loss, accuracy, precision, recall = result[0], result[1], result[2], result[3]

# report the metrics as percentages
print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision: {precision*100:.2f}%")
print(f"[+] Recall: {recall*100:.2f}%")
109+
110+
111+

0 commit comments

Comments
 (0)