Commit 9686a1b

Add lab for tweets
1 parent 2d8e20a commit 9686a1b

File tree

2 files changed: +253 −0 lines changed

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
import numpy as np
import tensorflow as tf

from utils import TweetLoader, EPS


def length(sequence):
    """Compute the true (un-padded) length of each sequence in the batch."""
    # A timestep is real (not padding) iff at least one of its features is non-zero
    used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
    length = tf.reduce_sum(used, 1)
    length = tf.cast(length, tf.int32)
    return length


def last_relevant(output, length):
    """Select, for each sequence in the batch, the output at its last real timestep."""
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    out_size = int(output.get_shape()[2])
    # Flatten (batch, time, features) to (batch * time, features), then gather
    # the row holding the last valid timestep of each sequence
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, out_size])
    relevant = tf.gather(flat, index)
    return relevant
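

# Quick illustrative sanity check (not part of the lab; the tensor values here
# are toy assumptions):
#
#   seq = tf.constant([[[1., 0.], [0., 2.], [0., 0.]]])  # 1 sequence, 3 steps, last is padding
#   sess.run(length(seq))  # -> [2]: only the first two timesteps contain non-zeros
#
# `last_relevant` then gathers the RNN output at index length - 1 of each sequence.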


class TweetModel(object):

    def __init__(self, x, targets, hidden_size):

        self.x = x
        self.targets = targets
        self.n_classes = int(targets.get_shape()[-1])

        self.hidden_size = hidden_size

        self.inference = None
        self.loss = None
        self.train_step = None
        self.accuracy = None

        self.make_inference()
        self.make_loss()
        self.make_train_step()
        self.make_accuracy()

    def make_inference(self):

        # Create LSTM cell with proper hidden size
        cell = tf.contrib.rnn.LSTMCell(self.hidden_size, state_is_tuple=True)

        # Get LSTM output, masking out zero-padded timesteps
        val, _ = tf.nn.dynamic_rnn(cell, self.x, dtype=tf.float32, sequence_length=length(self.x))

        # Get the last relevant output of the LSTM
        last = last_relevant(val, length(self.x))

        # Define the final prediction applying a fully connected layer with softmax
        self.inference = tf.layers.dense(inputs=last, units=self.n_classes, activation=tf.nn.softmax)

    def make_loss(self):
        # Cross-entropy loss; EPS keeps the argument of the log away from zero
        self.loss = - tf.reduce_sum(self.targets * tf.log(self.inference + EPS))

    def make_train_step(self):
        self.train_step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(self.loss)

    def make_accuracy(self):
        # `correct` marks the examples whose predicted class matches the target
        correct = tf.equal(tf.argmax(self.inference, axis=1), tf.argmax(self.targets, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
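

# Note on the loss above: a numerically safer alternative (a sketch, not what
# this lab uses) is to emit raw logits and let TensorFlow fuse softmax and log:
#
#   logits = tf.layers.dense(inputs=last, units=self.n_classes)
#   loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(
#       labels=self.targets, logits=logits))
#
# The explicit softmax + log + EPS version keeps the predicted probabilities
# directly inspectable, which suits the interactive demo below.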


if __name__ == '__main__':

    max_seq_len = 20
    max_dict_size = 1000
    hidden_size = 10  # LSTM cell dimension
    train_tweets_path = 'data/tweets_train.csv'
    val_tweets_path = 'data/tweets_val.csv'

    # Training parameters
    training_epochs = 20
    batch_size = 32
    batches_each_epoch = 500

    # Get tweet loader
    loader = TweetLoader(train_tweets_path, val_tweets_path, batch_size, max_seq_len, max_dict_size)

    # Declare placeholders
    x = tf.placeholder(dtype=tf.float32, shape=[None, max_seq_len, max_dict_size + 1])
    targets = tf.placeholder(dtype=tf.float32, shape=[None, 2])

    # Get a model
    model = TweetModel(x, targets, hidden_size)

    # Open new session
    sess = tf.Session()

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):

        x_batch, y_batch = loader.load_tweet_batch(mode='train')
        print('Epoch: {}\tTRAIN: Loss: {:.02f} Accuracy: {:.02f}'.format(
            epoch,
            sess.run(model.loss, {x: x_batch, targets: y_batch}),
            sess.run(model.accuracy, {x: x_batch, targets: y_batch})
        ))

        x_batch, y_batch = loader.load_tweet_batch(mode='val')
        print('Epoch: {}\tVAL: Loss: {:.02f} Accuracy: {:.02f}'.format(
            epoch,
            sess.run(model.loss, {x: x_batch, targets: y_batch}),
            sess.run(model.accuracy, {x: x_batch, targets: y_batch})
        ))

        for _ in range(batches_each_epoch):

            # Load a batch of training data
            x_batch, y_batch = loader.load_tweet_batch(mode='train')

            # Run one training step
            sess.run(fetches=[model.train_step],
                     feed_dict={x: x_batch, targets: y_batch})

    # Interactive session: classify user tweets until an empty line is entered
    while True:
        tw = input('Try tweeting something! ')
        if tw:
            x_num = loader.vectorize(tweet=tw)
            p, = sess.run([model.inference], feed_dict={x: x_num})
            if np.argmax(p) == 0:
                # Negative tweet
                print('Prediction:{}\t:('.format(p))
            else:
                # Positive tweet
                print('Prediction:{}\t:)'.format(p))
        else:
            break
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
"""
Lab utils for tweet loading. Still a work in progress.
"""

import re
import csv
import numpy as np
from collections import Counter
from google_drive_downloader import GoogleDriveDownloader

# Fetch and unzip the tweet dataset from Google Drive
GoogleDriveDownloader.download_file_from_google_drive(file_id='1fHezNVY4YWJVWYb_3P3kx2e9RstjY1OK',
                                                      dest_path='data/tweets.zip',
                                                      unzip=True)

EPS = np.finfo('float32').eps  # machine precision for float32
MAX_TWEET_CHARS = 140  # each tweet is at most 140 characters

def preprocess(line):
    """
    Pre-process a string of text. Additional pre-processing steps can be added here.
    """
    line = line.lower()               # turn to lowercase
    line = line.replace('\n', '')     # remove newlines
    line = re.sub(r'\W+', ' ', line)  # keep word characters only (\W is short for [^\w])

    return line
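

# Example (illustrative):
#
#   preprocess('Hello,\nWorld!!')  # -> 'hello world '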


def get_dictionary(filename, dict_size=2000):
    """
    Read the tweets and return the `dict_size` most common words.
    """
    all_words = []
    with open(filename, 'r') as csv_file:
        r = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row in r:
            tweet = row[3]  # the tweet text lives in the fourth csv column
            if len(tweet) <= MAX_TWEET_CHARS:
                words = preprocess(tweet).split()
                all_words += words

    # Make the dictionary out of only the N most common words
    word_counter = Counter(all_words)
    dictionary, _ = zip(*word_counter.most_common(min(dict_size, len(word_counter))))

    return dictionary
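

# Example (illustrative only; the actual words depend on the training csv):
#
#   get_dictionary('data/tweets_train.csv', dict_size=3)  # -> e.g. ('the', 'i', 'to')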


class TweetLoader(object):

    def __init__(self, filename_train, filename_val, batchsize, max_len, dict_size):

        self._filename_train = filename_train
        self._filename_val = filename_val
        self._batchsize = batchsize
        self._max_len = max_len
        self._dict_size = dict_size

        # Get the list of words that will constitute our dictionary (once only)
        self._dictionary = get_dictionary(self._filename_train, dict_size)

        self._train_rows = self.read_data(self._filename_train)
        self._val_rows = self.read_data(self._filename_val)

    def read_data(self, filename):
        # Read all rows of a csv file (used for both training and validation data)
        rows = []
        with open(filename, 'r') as csv_file:
            reader = csv.reader(csv_file, delimiter=',', quotechar='"')
            for row in reader:
                rows.append(row)
        return rows

    def vectorize(self, tweet):
        """
        Turn a tweet into a one-hot array of shape (1, max_len, dict_size + 1).
        Index 0 of the last axis is reserved for out-of-dictionary words.
        """
        words = preprocess(tweet).split()

        X = np.zeros(shape=(1, self._max_len, self._dict_size + 1))

        # Vectorization: one-hot encode each word, up to max_len words per tweet
        for j, w in enumerate(words):
            if j < self._max_len:
                try:
                    w_idx = self._dictionary.index(w)
                    X[0, j, w_idx + 1] = 1
                except ValueError:
                    # Word not found in the dictionary: use the "unknown" slot 0
                    X[0, j, 0] = 1

        return X
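
    # Example (illustrative; assumes 'good' sits at some index k in the dictionary):
    #
    #   loader.vectorize('Good day!')  # -> (1, max_len, dict_size + 1) array with
    #                                  #    X[0, 0, k + 1] == 1 for the word 'good'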

    def load_tweet_batch(self, mode):
        """
        Generate a batch of data from either the training or the validation set.
        """
        assert mode in ['train', 'val']
        if mode == 'train':
            rows = self._train_rows
        else:
            rows = self._val_rows

        # Prepare data structures (size the one-hot axis like `vectorize` does,
        # so batches always match the model placeholder shape)
        X_batch = np.zeros((self._batchsize, self._max_len, self._dict_size + 1), dtype=np.float32)
        Y_batch = np.zeros((self._batchsize, 2), dtype=np.float32)

        tweet_loaded = 0
        while tweet_loaded < self._batchsize:

            # Sample a random row; skip tweets that exceed the maximum length
            rand_idx = np.random.randint(0, len(rows))
            random_tweet = rows[rand_idx][3]
            if len(random_tweet) <= MAX_TWEET_CHARS:
                # Set the one-hot label only when the tweet is actually kept,
                # so skipped tweets cannot leave stray labels in Y_batch
                Y_batch[tweet_loaded, int(rows[rand_idx][1])] = 1

                X = self.vectorize(tweet=random_tweet)
                X_batch[tweet_loaded] = X[0]
                tweet_loaded += 1

        return X_batch, Y_batch
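

# Typical usage (mirrors the companion lab script):
#
#   loader = TweetLoader('data/tweets_train.csv', 'data/tweets_val.csv',
#                        batchsize=32, max_len=20, dict_size=1000)
#   X, Y = loader.load_tweet_batch(mode='train')  # X: (32, 20, 1001), Y: (32, 2)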
