
Commit 8a98d5f

Refactored
1 parent f161ed9 commit 8a98d5f

7 files changed: +4652 -50 lines changed

Module 4 - Machine Learning/02. Working with Text Data/4. Advance Text Vectorization/advance_text_vectorization.ipynb

Lines changed: 47 additions & 47 deletions
@@ -5,7 +5,7 @@
 "id": "7c0a730a-2cc5-4292-924c-50ef49c29faa",
 "metadata": {},
 "source": [
-"# **Advance Text Vectorization - Word2Vec, Pretrained GloVe and BERT**\n",
+"# **Advance Text Vectorization - Word2Vec, Pretrained GloVe and Pretrained BERT**\n",
 "\n",
 "### **What's Covered?**\n",
 "1. Text Vectorization\n",
@@ -411,9 +411,9 @@
 "source": [
 "from gensim.models import Word2Vec\n",
 "\n",
-"word2vec_model = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n",
+"word2vec_vect = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n",
 "\n",
-"print(word2vec_model)"
+"print(word2vec_vect)"
 ]
 },
 {
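
Note: the rename keeps the call itself unchanged; word2vec_vect is still a trained gensim Word2Vec model. As a minimal sketch of what this cell assumes (a tokenised_sentences column that holds lists of tokens, not raw strings; the toy data below is illustrative, not from the notebook):

    import pandas as pd
    from gensim.models import Word2Vec

    # each row must already be a list of tokens
    df = pd.DataFrame({"tokenised_sentences": [
        ["best", "time", "to", "learn"],
        ["worst", "time", "to", "sleep"],
    ]})

    # vector_size=100 -> 100-dimensional word embeddings; min_count=1 keeps every word
    word2vec_vect = Word2Vec(df["tokenised_sentences"], vector_size=100, min_count=1)
    print(word2vec_vect.wv["time"].shape)  # (100,)
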
@@ -439,17 +439,17 @@
 }
 ],
 "source": [
-"# We can look at unique words by using 'vocabulary_'\n",
+"# We can check out what is learned by \"word2vec_vect\"\n",
 "\n",
-"print(f\"Number of documents used for Training: {word2vec_model.corpus_count}\")\n",
+"print(f\"Number of documents used for Training: {word2vec_vect.corpus_count}\")\n",
 "print()\n",
-"print(f\"Vocabulary size: {len(word2vec_model.wv.index_to_key)}\")\n",
+"print(f\"Vocabulary size: {len(word2vec_vect.wv.index_to_key)}\")\n",
 "print()\n",
-"print(f\"Vocabulary: {word2vec_model.wv.index_to_key}\")\n",
+"print(f\"Vocabulary: {word2vec_vect.wv.index_to_key}\")\n",
 "print()\n",
-"print(f\"Let's look at the vocabulary stored in the object: {word2vec_model.wv.key_to_index}\")\n",
+"print(f\"Let's look at the vocabulary stored in the object: {word2vec_vect.wv.key_to_index}\")\n",
 "print()\n",
-"print(f\"Vector Size: {word2vec_model.vector_size}\")"
+"print(f\"Vector Size: {word2vec_vect.vector_size}\")"
 ]
 },
 {
@@ -468,7 +468,7 @@
 ],
 "source": [
 "# Word frequencies\n",
-"word_frequencies = {word: word2vec_model.wv.get_vecattr(word, \"count\") for word in word2vec_model.wv.index_to_key}\n",
+"word_frequencies = {word: word2vec_vect.wv.get_vecattr(word, \"count\") for word in word2vec_vect.wv.index_to_key}\n",
 "\n",
 "print(f\"Word frequencies: {list(word_frequencies.items())[:10]}\")"
 ]
@@ -516,12 +516,12 @@
 "source": [
 "# Getting vector for a word\n",
 "\n",
-"print(f\"Word Embedding Shape: { word2vec_model.wv['time'].shape }\")\n",
+"print(f\"Word Embedding Shape: { word2vec_vect.wv['time'].shape }\")\n",
 "print()\n",
-"print(word2vec_model.wv[\"time\"])\n",
+"print(word2vec_vect.wv[\"time\"])\n",
 "\n",
 "# # We can also use the following:\n",
-"# print(word2vec_model.wv.__getitem__('time'))"
+"# print(word2vec_vect.wv.__getitem__('time'))"
 ]
 },
 {
@@ -692,9 +692,9 @@
 "source": [
 "# Access the 100D vectors for all 7 words\n",
 "\n",
-"print(f\"Shape: { word2vec_model.wv[word2vec_model.wv.index_to_key].shape }\")\n",
+"print(f\"Shape: { word2vec_vect.wv[word2vec_vect.wv.index_to_key].shape }\")\n",
 "print()\n",
-"print(word2vec_model.wv[word2vec_model.wv.index_to_key])"
+"print(word2vec_vect.wv[word2vec_vect.wv.index_to_key])"
 ]
 },
 {
@@ -705,11 +705,11 @@
 "outputs": [],
 "source": [
 "# save model\n",
-"# word2vec_model.save('model/first_word_vectors.bin')\n",
+"# word2vec_vect.save('model/first_word_vectors.bin')\n",
 "\n",
 "# # load model\n",
-"# word2vec_model = Word2Vec.load('model/first_word_vectors.bin')\n",
-"# print(word2vec_model)"
+"# word2vec_vect = Word2Vec.load('model/first_word_vectors.bin')\n",
+"# print(word2vec_vect)"
 ]
 },
 {
@@ -735,13 +735,13 @@
 "from sklearn.decomposition import PCA\n",
 "import matplotlib.pyplot as plt\n",
 "\n",
-"X = word2vec_model.wv[word2vec_model.wv.index_to_key]\n",
+"X = word2vec_vect.wv[word2vec_vect.wv.index_to_key]\n",
 "pca = PCA(n_components = 2)\n",
 "result = pca.fit_transform(X)\n",
 "\n",
 "# create a scatter plot of the projection\n",
 "plt.scatter(result[:, 0], result[:, 1])\n",
-"words = list(word2vec_model.wv.index_to_key)\n",
+"words = list(word2vec_vect.wv.index_to_key)\n",
 "for i, word in enumerate(words):\n",
 " plt.annotate(word, xy=(result[i, 0], result[i, 1]))\n",
 "plt.show()"
@@ -763,7 +763,7 @@
 ],
 "source": [
 "# Find most similar words\n",
-"similar_words = word2vec_model.wv.most_similar('time', topn=1)\n",
+"similar_words = word2vec_vect.wv.most_similar('time', topn=1)\n",
 "\n",
 "print(similar_words)"
 ]
@@ -788,7 +788,7 @@
 "source": [
 "# Computing the similarity between word vectors\n",
 "\n",
-"word2vec_model.wv.similarity('best', 'worst')"
+"word2vec_vect.wv.similarity('best', 'worst')"
 ]
 },
 {
@@ -821,7 +821,7 @@
 "\n",
 "sentence = ['best', 'bansal', 'time', 'kanav']\n",
 "\n",
-"vocab_tokens = [word for word in sentence if word in word2vec_model.wv.index_to_key]\n",
+"vocab_tokens = [word for word in sentence if word in word2vec_vect.wv.index_to_key]\n",
 "\n",
 "vocab_tokens"
 ]
@@ -870,7 +870,7 @@
 ],
 "source": [
 "# Create document vectors by averaging word vectors\n",
-"np.mean(word2vec_model.wv[vocab_tokens], axis=0)"
+"np.mean(word2vec_vect.wv[vocab_tokens], axis=0)"
 ]
 },
 {
@@ -977,7 +977,7 @@
 }
 ],
 "source": [
-"df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_model.wv))\n",
+"df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_vect.wv))\n",
 "\n",
 "df.head()"
 ]
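
Note: get_document_vector is defined in an earlier notebook cell that this hunk does not show. Based on the averaging cell above (np.mean over the in-vocabulary word vectors), a sketch of what that helper presumably looks like:

    import numpy as np

    def get_document_vector(tokens, keyed_vectors):
        # keep only tokens the embedding model has actually seen
        vocab_tokens = [word for word in tokens if word in keyed_vectors.index_to_key]
        if not vocab_tokens:
            # fall back to a zero vector for documents with no in-vocabulary words
            return np.zeros(keyed_vectors.vector_size)
        # average the word vectors into one fixed-size document vector
        return np.mean(keyed_vectors[vocab_tokens], axis=0)

Passing word2vec_vect.wv (a KeyedVectors object) rather than the full model matches this signature.
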
@@ -1020,11 +1020,11 @@
 "# # For the sake of this example, we will be loading pre-trained GloVe model\n",
 "# # You can also choose a Word2Vec or a FastText model as well\n",
 "\n",
-"# glove_model = api.load('glove-twitter-50')\n",
+"# glove_vect = api.load('glove-twitter-50')\n",
 "\n",
 "# # Approax 200MB Size\n",
 "# # Save Embeddings\n",
-"# glove_model.save('pretrained_models/.50d_glove_vec.kv')"
+"# glove_vect.save('pretrained_models/.50d_glove_vec.kv')"
 ]
 },
 {
@@ -1045,9 +1045,9 @@
 "# Load Embeddings\n",
 "from gensim.models import KeyedVectors\n",
 "\n",
-"glove_model = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n",
+"glove_vect = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n",
 "\n",
-"print(type(glove_model))"
+"print(type(glove_vect))"
 ]
 },
 {
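
Note: the two renamed objects are indexed differently, which is why the following hunks drop the .wv for GloVe. Assuming both objects from the cells above are in scope, a quick sketch of the difference:

    # word2vec_vect is a full gensim Word2Vec model: its vectors live under .wv
    vec_time = word2vec_vect.wv["time"]

    # glove_vect is a KeyedVectors instance: index it directly, no .wv
    vec_college = glove_vect["college"]
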
@@ -1068,8 +1068,8 @@
 "source": [
 "# Vocabulary Size and Word Embedding Shape\n",
 "\n",
-"print(f\"Vocabulary size: {len(glove_model.index_to_key)}\")\n",
-"print(f\"Vector Size: {glove_model.vector_size}\")"
+"print(f\"Vocabulary size: {len(glove_vect.index_to_key)}\")\n",
+"print(f\"Vector Size: {glove_vect.vector_size}\")"
 ]
 },
 {
@@ -1098,9 +1098,9 @@
 "source": [
 "# Getting vector for a word\n",
 "\n",
-"print(f\"Word Embedding Shape: { glove_model['college'].shape }\")\n",
+"print(f\"Word Embedding Shape: { glove_vect['college'].shape }\")\n",
 "print()\n",
-"print(glove_model['college'])\n",
+"print(glove_vect['college'])\n",
 "\n",
 "# # We can also use the following:\n",
 "# print(model.wv.__getitem__('time'))"
@@ -1121,7 +1121,7 @@
 }
 ],
 "source": [
-"tokens = word2vec_model.wv.index_to_key\n",
+"tokens = word2vec_vect.wv.index_to_key\n",
 "\n",
 "print(tokens)"
 ]
@@ -1147,7 +1147,7 @@
 "from sklearn.decomposition import PCA\n",
 "import matplotlib.pyplot as plt\n",
 "\n",
-"X = glove_model[tokens]\n",
+"X = glove_vect[tokens]\n",
 "pca = PCA(n_components = 2)\n",
 "result = pca.fit_transform(X)\n",
 "\n",
@@ -1174,7 +1174,7 @@
 ],
 "source": [
 "# Find most similar words\n",
-"similar_words = glove_model.most_similar('time', topn=5)\n",
+"similar_words = glove_vect.most_similar('time', topn=5)\n",
 "\n",
 "print(similar_words)"
 ]
@@ -1201,7 +1201,7 @@
 }
 ],
 "source": [
-"glove_model.most_similar(\"developer\", topn=5)"
+"glove_vect.most_similar(\"developer\", topn=5)"
 ]
 },
 {
@@ -1224,7 +1224,7 @@
 "source": [
 "# Computing the similarity between word vectors\n",
 "\n",
-"glove_model.similarity(\"developer\", \"development\")"
+"glove_vect.similarity(\"developer\", \"development\")"
 ]
 },
 {
@@ -1332,7 +1332,7 @@
 }
 ],
 "source": [
-"df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda x : get_document_vector(x, glove_model))\n",
+"df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, glove_vect))\n",
 "\n",
 "df.head()"
 ]
@@ -1409,7 +1409,7 @@
 "source": [
 "from sentence_transformers import SentenceTransformer\n",
 "\n",
-"bert_model = SentenceTransformer('all-MiniLM-L6-v2')"
+"bert_vect = SentenceTransformer('all-MiniLM-L6-v2')"
 ]
 },
 {
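
Note: a quick sanity-check sketch for the renamed object; all-MiniLM-L6-v2 returns 384-dimensional sentence embeddings (the example sentence here is illustrative, not from the notebook):

    from sentence_transformers import SentenceTransformer

    bert_vect = SentenceTransformer('all-MiniLM-L6-v2')
    emb = bert_vect.encode("Advance text vectorization with sentence embeddings")
    print(emb.shape)  # (384,)
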
@@ -1792,8 +1792,8 @@
 " 'Sentences are passed as a list of string.',\n",
 " 'The quick brown fox jumps over the lazy dog.']\n",
 "\n",
-"# Sentences are encoded by calling model.encode()\n",
-"embeddings = model.encode(sentences)\n",
+"# Sentences are encoded by calling bert_vect.encode()\n",
+"embeddings = bert_vect.encode(sentences)\n",
 "\n",
 "# Print the embeddings\n",
 "for sentence, embedding in zip(sentences, embeddings):\n",
@@ -1841,9 +1841,9 @@
 "source": [
 "from sentence_transformers import util\n",
 "\n",
-"emb1 = model.encode(\"I am eating Mango\")\n",
-"emb2 = model.encode(\"I like fruits\")\n",
-"emb3 = model.encode(\"I work at Microsoft\")\n",
+"emb1 = bert_vect.encode(\"I am eating Mango\")\n",
+"emb2 = bert_vect.encode(\"I like fruits\")\n",
+"emb3 = bert_vect.encode(\"I work at Microsoft\")\n",
 "cos_sim_12 = util.cos_sim(emb1, emb2)\n",
 "cos_sim_13 = util.cos_sim(emb1, emb3)\n",
 "print(\"Cosine-Similarity between 1 and 2:\", cos_sim_12)\n",
@@ -1897,7 +1897,7 @@
 " ]\n",
 "\n",
 "#Encode all sentences\n",
-"embeddings = model.encode(sentences)\n",
+"embeddings = bert_vect.encode(sentences)\n",
 "\n",
 "#Compute cosine similarity between all pairs\n",
 "cos_sim = util.cos_sim(embeddings, embeddings)\n",
@@ -2013,7 +2013,7 @@
 }
 ],
 "source": [
-"df['sbert_doc_embeddings'] = df['clean_text'].apply(model.encode)\n",
+"df['sbert_doc_embeddings'] = df['clean_text'].apply(bert_vect.encode)\n",
 "\n",
 "df.head()"
 ]
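
Note: .apply(bert_vect.encode) encodes one row at a time. encode() also accepts a list of strings and batches internally, which is usually faster on larger frames; an illustrative alternative (not part of this commit):

    # encode the whole column in one batched call
    embeddings = bert_vect.encode(df['clean_text'].tolist())
    df['sbert_doc_embeddings'] = list(embeddings)
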
