|
5 | 5 | "id": "7c0a730a-2cc5-4292-924c-50ef49c29faa", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "# **Advance Text Vectorization - Word2Vec, Pretrained GloVe and BERT**\n", |
|  8 | + "# **Advanced Text Vectorization - Word2Vec, Pretrained GloVe and Pretrained BERT**\n", |
9 | 9 | "\n", |
10 | 10 | "### **What's Covered?**\n", |
11 | 11 | "1. Text Vectorization\n", |
|
411 | 411 | "source": [ |
412 | 412 | "from gensim.models import Word2Vec\n", |
413 | 413 | "\n", |
414 | | - "word2vec_model = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n", |
| 414 | + "word2vec_vect = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n", |
415 | 415 | "\n", |
416 | | - "print(word2vec_model)" |
| 416 | + "print(word2vec_vect)" |
417 | 417 | ] |
418 | 418 | }, |
419 | 419 | { |
|
439 | 439 | } |
440 | 440 | ], |
441 | 441 | "source": [ |
442 | | - "# We can look at unique words by using 'vocabulary_'\n", |
|  442 | + "# We can inspect what \"word2vec_vect\" has learned\n", |
443 | 443 | "\n", |
444 | | - "print(f\"Number of documents used for Training: {word2vec_model.corpus_count}\")\n", |
| 444 | + "print(f\"Number of documents used for Training: {word2vec_vect.corpus_count}\")\n", |
445 | 445 | "print()\n", |
446 | | - "print(f\"Vocabulary size: {len(word2vec_model.wv.index_to_key)}\")\n", |
| 446 | + "print(f\"Vocabulary size: {len(word2vec_vect.wv.index_to_key)}\")\n", |
447 | 447 | "print()\n", |
448 | | - "print(f\"Vocabulary: {word2vec_model.wv.index_to_key}\")\n", |
| 448 | + "print(f\"Vocabulary: {word2vec_vect.wv.index_to_key}\")\n", |
449 | 449 | "print()\n", |
450 | | - "print(f\"Let's look at the vocabulary stored in the object: {word2vec_model.wv.key_to_index}\")\n", |
| 450 | + "print(f\"Let's look at the vocabulary stored in the object: {word2vec_vect.wv.key_to_index}\")\n", |
451 | 451 | "print()\n", |
452 | | - "print(f\"Vector Size: {word2vec_model.vector_size}\")" |
| 452 | + "print(f\"Vector Size: {word2vec_vect.vector_size}\")" |
453 | 453 | ] |
454 | 454 | }, |
455 | 455 | { |
|
468 | 468 | ], |
469 | 469 | "source": [ |
470 | 470 | "# Word frequencies\n", |
471 | | - "word_frequencies = {word: word2vec_model.wv.get_vecattr(word, \"count\") for word in word2vec_model.wv.index_to_key}\n", |
| 471 | + "word_frequencies = {word: word2vec_vect.wv.get_vecattr(word, \"count\") for word in word2vec_vect.wv.index_to_key}\n", |
472 | 472 | "\n", |
473 | 473 | "print(f\"Word frequencies: {list(word_frequencies.items())[:10]}\")" |
474 | 474 | ] |
|
516 | 516 | "source": [ |
517 | 517 | "# Getting vector for a word\n", |
518 | 518 | "\n", |
519 | | - "print(f\"Word Embedding Shape: { word2vec_model.wv['time'].shape }\")\n", |
| 519 | + "print(f\"Word Embedding Shape: { word2vec_vect.wv['time'].shape }\")\n", |
520 | 520 | "print()\n", |
521 | | - "print(word2vec_model.wv[\"time\"])\n", |
| 521 | + "print(word2vec_vect.wv[\"time\"])\n", |
522 | 522 | "\n", |
523 | 523 | "# # We can also use the following:\n", |
524 | | - "# print(word2vec_model.wv.__getitem__('time'))" |
| 524 | + "# print(word2vec_vect.wv.__getitem__('time'))" |
525 | 525 | ] |
526 | 526 | }, |
527 | 527 | { |
|
692 | 692 | "source": [ |
693 | 693 | "# Access the 100D vectors for all 7 words\n", |
694 | 694 | "\n", |
695 | | - "print(f\"Shape: { word2vec_model.wv[word2vec_model.wv.index_to_key].shape }\")\n", |
| 695 | + "print(f\"Shape: { word2vec_vect.wv[word2vec_vect.wv.index_to_key].shape }\")\n", |
696 | 696 | "print()\n", |
697 | | - "print(word2vec_model.wv[word2vec_model.wv.index_to_key])" |
| 697 | + "print(word2vec_vect.wv[word2vec_vect.wv.index_to_key])" |
698 | 698 | ] |
699 | 699 | }, |
700 | 700 | { |
|
705 | 705 | "outputs": [], |
706 | 706 | "source": [ |
707 | 707 | "# save model\n", |
708 | | - "# word2vec_model.save('model/first_word_vectors.bin')\n", |
| 708 | + "# word2vec_vect.save('model/first_word_vectors.bin')\n", |
709 | 709 | "\n", |
710 | 710 | "# # load model\n", |
711 | | - "# word2vec_model = Word2Vec.load('model/first_word_vectors.bin')\n", |
712 | | - "# print(word2vec_model)" |
| 711 | + "# word2vec_vect = Word2Vec.load('model/first_word_vectors.bin')\n", |
| 712 | + "# print(word2vec_vect)" |
713 | 713 | ] |
714 | 714 | }, |
715 | 715 | { |
|
735 | 735 | "from sklearn.decomposition import PCA\n", |
736 | 736 | "import matplotlib.pyplot as plt\n", |
737 | 737 | "\n", |
738 | | - "X = word2vec_model.wv[word2vec_model.wv.index_to_key]\n", |
| 738 | + "X = word2vec_vect.wv[word2vec_vect.wv.index_to_key]\n", |
739 | 739 | "pca = PCA(n_components = 2)\n", |
740 | 740 | "result = pca.fit_transform(X)\n", |
741 | 741 | "\n", |
742 | 742 | "# create a scatter plot of the projection\n", |
743 | 743 | "plt.scatter(result[:, 0], result[:, 1])\n", |
744 | | - "words = list(word2vec_model.wv.index_to_key)\n", |
| 744 | + "words = list(word2vec_vect.wv.index_to_key)\n", |
745 | 745 | "for i, word in enumerate(words):\n", |
746 | 746 | " plt.annotate(word, xy=(result[i, 0], result[i, 1]))\n", |
747 | 747 | "plt.show()" |
|
763 | 763 | ], |
764 | 764 | "source": [ |
765 | 765 | "# Find most similar words\n", |
766 | | - "similar_words = word2vec_model.wv.most_similar('time', topn=1)\n", |
| 766 | + "similar_words = word2vec_vect.wv.most_similar('time', topn=1)\n", |
767 | 767 | "\n", |
768 | 768 | "print(similar_words)" |
769 | 769 | ] |
|
788 | 788 | "source": [ |
789 | 789 | "# Computing the similarity between word vectors\n", |
790 | 790 | "\n", |
791 | | - "word2vec_model.wv.similarity('best', 'worst')" |
| 791 | + "word2vec_vect.wv.similarity('best', 'worst')" |
792 | 792 | ] |
793 | 793 | }, |
794 | 794 | { |
|
821 | 821 | "\n", |
822 | 822 | "sentence = ['best', 'bansal', 'time', 'kanav']\n", |
823 | 823 | "\n", |
824 | | - "vocab_tokens = [word for word in sentence if word in word2vec_model.wv.index_to_key]\n", |
| 824 | + "vocab_tokens = [word for word in sentence if word in word2vec_vect.wv.index_to_key]\n", |
825 | 825 | "\n", |
826 | 826 | "vocab_tokens" |
827 | 827 | ] |
|
870 | 870 | ], |
871 | 871 | "source": [ |
872 | 872 | "# Create document vectors by averaging word vectors\n", |
873 | | - "np.mean(word2vec_model.wv[vocab_tokens], axis=0)" |
| 873 | + "np.mean(word2vec_vect.wv[vocab_tokens], axis=0)" |
874 | 874 | ] |
875 | 875 | }, |
876 | 876 | { |
|
977 | 977 | } |
978 | 978 | ], |
979 | 979 | "source": [ |
980 | | - "df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_model.wv))\n", |
| 980 | + "df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_vect.wv))\n", |
981 | 981 | "\n", |
982 | 982 | "df.head()" |
983 | 983 | ] |
|
1020 | 1020 | "# # For the sake of this example, we will be loading a pre-trained GloVe model\n", |
1021 | 1021 | "# # You can also choose a Word2Vec or a FastText model\n", |
1022 | 1022 | "\n", |
1023 | | - "# glove_model = api.load('glove-twitter-50')\n", |
| 1023 | + "# glove_vect = api.load('glove-twitter-50')\n", |
1024 | 1024 | "\n", |
1025 | 1025 | "# # Approx. 200 MB in size\n", |
1026 | 1026 | "# # Save Embeddings\n", |
1027 | | - "# glove_model.save('pretrained_models/.50d_glove_vec.kv')" |
| 1027 | + "# glove_vect.save('pretrained_models/.50d_glove_vec.kv')" |
1028 | 1028 | ] |
1029 | 1029 | }, |
1030 | 1030 | { |
|
1045 | 1045 | "# Load Embeddings\n", |
1046 | 1046 | "from gensim.models import KeyedVectors\n", |
1047 | 1047 | "\n", |
1048 | | - "glove_model = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n", |
| 1048 | + "glove_vect = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n", |
1049 | 1049 | "\n", |
1050 | | - "print(type(glove_model))" |
| 1050 | + "print(type(glove_vect))" |
1051 | 1051 | ] |
1052 | 1052 | }, |
1053 | 1053 | { |
|
1068 | 1068 | "source": [ |
1069 | 1069 | "# Vocabulary Size and Word Embedding Shape\n", |
1070 | 1070 | "\n", |
1071 | | - "print(f\"Vocabulary size: {len(glove_model.index_to_key)}\")\n", |
1072 | | - "print(f\"Vector Size: {glove_model.vector_size}\")" |
| 1071 | + "print(f\"Vocabulary size: {len(glove_vect.index_to_key)}\")\n", |
| 1072 | + "print(f\"Vector Size: {glove_vect.vector_size}\")" |
1073 | 1073 | ] |
1074 | 1074 | }, |
1075 | 1075 | { |
|
1098 | 1098 | "source": [ |
1099 | 1099 | "# Getting vector for a word\n", |
1100 | 1100 | "\n", |
1101 | | - "print(f\"Word Embedding Shape: { glove_model['college'].shape }\")\n", |
| 1101 | + "print(f\"Word Embedding Shape: { glove_vect['college'].shape }\")\n", |
1102 | 1102 | "print()\n", |
1103 | | - "print(glove_model['college'])\n", |
| 1103 | + "print(glove_vect['college'])\n", |
1104 | 1104 | "\n", |
1105 | 1105 | "# # We can also use the following:\n", |
1106 | 1106 | "# print(glove_vect.__getitem__('college'))" |
|
1121 | 1121 | } |
1122 | 1122 | ], |
1123 | 1123 | "source": [ |
1124 | | - "tokens = word2vec_model.wv.index_to_key\n", |
| 1124 | + "tokens = word2vec_vect.wv.index_to_key\n", |
1125 | 1125 | "\n", |
1126 | 1126 | "print(tokens)" |
1127 | 1127 | ] |
|
1147 | 1147 | "from sklearn.decomposition import PCA\n", |
1148 | 1148 | "import matplotlib.pyplot as plt\n", |
1149 | 1149 | "\n", |
1150 | | - "X = glove_model[tokens]\n", |
| 1150 | + "X = glove_vect[tokens]\n", |
1151 | 1151 | "pca = PCA(n_components = 2)\n", |
1152 | 1152 | "result = pca.fit_transform(X)\n", |
1153 | 1153 | "\n", |
|
1174 | 1174 | ], |
1175 | 1175 | "source": [ |
1176 | 1176 | "# Find most similar words\n", |
1177 | | - "similar_words = glove_model.most_similar('time', topn=5)\n", |
| 1177 | + "similar_words = glove_vect.most_similar('time', topn=5)\n", |
1178 | 1178 | "\n", |
1179 | 1179 | "print(similar_words)" |
1180 | 1180 | ] |
|
1201 | 1201 | } |
1202 | 1202 | ], |
1203 | 1203 | "source": [ |
1204 | | - "glove_model.most_similar(\"developer\", topn=5)" |
| 1204 | + "glove_vect.most_similar(\"developer\", topn=5)" |
1205 | 1205 | ] |
1206 | 1206 | }, |
1207 | 1207 | { |
|
1224 | 1224 | "source": [ |
1225 | 1225 | "# Computing the similarity between word vectors\n", |
1226 | 1226 | "\n", |
1227 | | - "glove_model.similarity(\"developer\", \"development\")" |
| 1227 | + "glove_vect.similarity(\"developer\", \"development\")" |
1228 | 1228 | ] |
1229 | 1229 | }, |
1230 | 1230 | { |
|
1332 | 1332 | } |
1333 | 1333 | ], |
1334 | 1334 | "source": [ |
1335 | | - "df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda x : get_document_vector(x, glove_model))\n", |
| 1335 | + "df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, glove_vect))\n", |
1336 | 1336 | "\n", |
1337 | 1337 | "df.head()" |
1338 | 1338 | ] |
|
1409 | 1409 | "source": [ |
1410 | 1410 | "from sentence_transformers import SentenceTransformer\n", |
1411 | 1411 | "\n", |
1412 | | - "bert_model = SentenceTransformer('all-MiniLM-L6-v2')" |
| 1412 | + "bert_vect = SentenceTransformer('all-MiniLM-L6-v2')" |
1413 | 1413 | ] |
1414 | 1414 | }, |
1415 | 1415 | { |
|
1792 | 1792 | " 'Sentences are passed as a list of string.',\n", |
1793 | 1793 | " 'The quick brown fox jumps over the lazy dog.']\n", |
1794 | 1794 | "\n", |
1795 | | - "# Sentences are encoded by calling model.encode()\n", |
1796 | | - "embeddings = model.encode(sentences)\n", |
| 1795 | + "# Sentences are encoded by calling bert_vect.encode()\n", |
| 1796 | + "embeddings = bert_vect.encode(sentences)\n", |
1797 | 1797 | "\n", |
1798 | 1798 | "# Print the embeddings\n", |
1799 | 1799 | "for sentence, embedding in zip(sentences, embeddings):\n", |
|
1841 | 1841 | "source": [ |
1842 | 1842 | "from sentence_transformers import util\n", |
1843 | 1843 | "\n", |
1844 | | - "emb1 = model.encode(\"I am eating Mango\")\n", |
1845 | | - "emb2 = model.encode(\"I like fruits\")\n", |
1846 | | - "emb3 = model.encode(\"I work at Microsoft\")\n", |
| 1844 | + "emb1 = bert_vect.encode(\"I am eating Mango\")\n", |
| 1845 | + "emb2 = bert_vect.encode(\"I like fruits\")\n", |
| 1846 | + "emb3 = bert_vect.encode(\"I work at Microsoft\")\n", |
1847 | 1847 | "cos_sim_12 = util.cos_sim(emb1, emb2)\n", |
1848 | 1848 | "cos_sim_13 = util.cos_sim(emb1, emb3)\n", |
1849 | 1849 | "print(\"Cosine-Similarity between 1 and 2:\", cos_sim_12)\n", |
|
1897 | 1897 | " ]\n", |
1898 | 1898 | "\n", |
1899 | 1899 | "#Encode all sentences\n", |
1900 | | - "embeddings = model.encode(sentences)\n", |
| 1900 | + "embeddings = bert_vect.encode(sentences)\n", |
1901 | 1901 | "\n", |
1902 | 1902 | "#Compute cosine similarity between all pairs\n", |
1903 | 1903 | "cos_sim = util.cos_sim(embeddings, embeddings)\n", |
|
2013 | 2013 | } |
2014 | 2014 | ], |
2015 | 2015 | "source": [ |
2016 | | - "df['sbert_doc_embeddings'] = df['clean_text'].apply(model.encode)\n", |
| 2016 | + "df['sbert_doc_embeddings'] = df['clean_text'].apply(bert_vect.encode)\n", |
2017 | 2017 | "\n", |
2018 | 2018 | "df.head()" |
2019 | 2019 | ] |
|