|
5 | 5 | "id": "7c0a730a-2cc5-4292-924c-50ef49c29faa", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "# **Advance Text Vectorization - Word2Vec, Pretrained GloVe and BERT**\n", |
|  8 | + "# **Advanced Text Vectorization - Word2Vec, Pretrained GloVe and Pretrained BERT**\n", |
9 | 9 | "\n", |
10 | 10 | "### **What's Covered?**\n", |
11 | 11 | "1. Text Vectorization\n", |
|
411 | 411 | "source": [ |
412 | 412 | "from gensim.models import Word2Vec\n", |
413 | 413 | "\n", |
414 | | - "word2vec_model = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n", |
| 414 | + "word2vec_vect = Word2Vec(df['tokenised_sentences'], vector_size=100, min_count=1)\n", |
415 | 415 | "\n", |
416 | | - "print(word2vec_model)" |
| 416 | + "print(word2vec_vect)" |
417 | 417 | ] |
418 | 418 | }, |
419 | 419 | { |
|
439 | 439 | } |
440 | 440 | ], |
441 | 441 | "source": [ |
442 | | - "# We can look at unique words by using 'vocabulary_'\n", |
|  442 | + "# We can inspect what \"word2vec_vect\" has learned\n", |
443 | 443 | "\n", |
444 | | - "print(f\"Number of documents used for Training: {word2vec_model.corpus_count}\")\n", |
| 444 | + "print(f\"Number of documents used for Training: {word2vec_vect.corpus_count}\")\n", |
445 | 445 | "print()\n", |
446 | | - "print(f\"Vocabulary size: {len(word2vec_model.wv.index_to_key)}\")\n", |
| 446 | + "print(f\"Vocabulary size: {len(word2vec_vect.wv.index_to_key)}\")\n", |
447 | 447 | "print()\n", |
448 | | - "print(f\"Vocabulary: {word2vec_model.wv.index_to_key}\")\n", |
| 448 | + "print(f\"Vocabulary: {word2vec_vect.wv.index_to_key}\")\n", |
449 | 449 | "print()\n", |
450 | | - "print(f\"Let's look at the vocabulary stored in the object: {word2vec_model.wv.key_to_index}\")\n", |
| 450 | + "print(f\"Let's look at the vocabulary stored in the object: {word2vec_vect.wv.key_to_index}\")\n", |
451 | 451 | "print()\n", |
452 | | - "print(f\"Vector Size: {word2vec_model.vector_size}\")" |
| 452 | + "print(f\"Vector Size: {word2vec_vect.vector_size}\")" |
453 | 453 | ] |
454 | 454 | }, |
455 | 455 | { |
|
468 | 468 | ], |
469 | 469 | "source": [ |
470 | 470 | "# Word frequencies\n", |
471 | | - "word_frequencies = {word: word2vec_model.wv.get_vecattr(word, \"count\") for word in word2vec_model.wv.index_to_key}\n", |
| 471 | + "word_frequencies = {word: word2vec_vect.wv.get_vecattr(word, \"count\") for word in word2vec_vect.wv.index_to_key}\n", |
472 | 472 | "\n", |
473 | 473 | "print(f\"Word frequencies: {list(word_frequencies.items())[:10]}\")" |
474 | 474 | ] |
|
516 | 516 | "source": [ |
517 | 517 | "# Getting vector for a word\n", |
518 | 518 | "\n", |
519 | | - "print(f\"Word Embedding Shape: { word2vec_model.wv['time'].shape }\")\n", |
| 519 | + "print(f\"Word Embedding Shape: { word2vec_vect.wv['time'].shape }\")\n", |
520 | 520 | "print()\n", |
521 | | - "print(word2vec_model.wv[\"time\"])\n", |
| 521 | + "print(word2vec_vect.wv[\"time\"])\n", |
522 | 522 | "\n", |
523 | 523 | "# # We can also use the following:\n", |
524 | | - "# print(word2vec_model.wv.__getitem__('time'))" |
| 524 | + "# print(word2vec_vect.wv.__getitem__('time'))" |
525 | 525 | ] |
526 | 526 | }, |
527 | 527 | { |
|
692 | 692 | "source": [ |
693 | 693 | "# Access the 100D vectors for all 7 words\n", |
694 | 694 | "\n", |
695 | | - "print(f\"Shape: { word2vec_model.wv[word2vec_model.wv.index_to_key].shape }\")\n", |
| 695 | + "print(f\"Shape: { word2vec_vect.wv[word2vec_vect.wv.index_to_key].shape }\")\n", |
696 | 696 | "print()\n", |
697 | | - "print(word2vec_model.wv[word2vec_model.wv.index_to_key])" |
| 697 | + "print(word2vec_vect.wv[word2vec_vect.wv.index_to_key])" |
698 | 698 | ] |
699 | 699 | }, |
700 | 700 | { |
|
705 | 705 | "outputs": [], |
706 | 706 | "source": [ |
707 | 707 | "# save model\n", |
708 | | - "# word2vec_model.save('model/first_word_vectors.bin')\n", |
| 708 | + "# word2vec_vect.save('model/first_word_vectors.bin')\n", |
709 | 709 | "\n", |
710 | 710 | "# # load model\n", |
711 | | - "# word2vec_model = Word2Vec.load('model/first_word_vectors.bin')\n", |
712 | | - "# print(word2vec_model)" |
| 711 | + "# word2vec_vect = Word2Vec.load('model/first_word_vectors.bin')\n", |
| 712 | + "# print(word2vec_vect)" |
713 | 713 | ] |
714 | 714 | }, |
715 | 715 | { |
|
735 | 735 | "from sklearn.decomposition import PCA\n", |
736 | 736 | "import matplotlib.pyplot as plt\n", |
737 | 737 | "\n", |
738 | | - "X = word2vec_model.wv[word2vec_model.wv.index_to_key]\n", |
| 738 | + "X = word2vec_vect.wv[word2vec_vect.wv.index_to_key]\n", |
739 | 739 | "pca = PCA(n_components = 2)\n", |
740 | 740 | "result = pca.fit_transform(X)\n", |
741 | 741 | "\n", |
742 | 742 | "# create a scatter plot of the projection\n", |
743 | 743 | "plt.scatter(result[:, 0], result[:, 1])\n", |
744 | | - "words = list(word2vec_model.wv.index_to_key)\n", |
| 744 | + "words = list(word2vec_vect.wv.index_to_key)\n", |
745 | 745 | "for i, word in enumerate(words):\n", |
746 | 746 | " plt.annotate(word, xy=(result[i, 0], result[i, 1]))\n", |
747 | 747 | "plt.show()" |
|
763 | 763 | ], |
764 | 764 | "source": [ |
765 | 765 | "# Find most similar words\n", |
766 | | - "similar_words = word2vec_model.wv.most_similar('time', topn=1)\n", |
| 766 | + "similar_words = word2vec_vect.wv.most_similar('time', topn=1)\n", |
767 | 767 | "\n", |
768 | 768 | "print(similar_words)" |
769 | 769 | ] |
|
788 | 788 | "source": [ |
789 | 789 | "# Computing the similarity between word vectors\n", |
790 | 790 | "\n", |
791 | | - "word2vec_model.wv.similarity('best', 'worst')" |
| 791 | + "word2vec_vect.wv.similarity('best', 'worst')" |
792 | 792 | ] |
793 | 793 | }, |
794 | 794 | { |
|
821 | 821 | "\n", |
822 | 822 | "sentence = ['best', 'bansal', 'time', 'kanav']\n", |
823 | 823 | "\n", |
824 | | - "vocab_tokens = [word for word in sentence if word in word2vec_model.wv.index_to_key]\n", |
| 824 | + "vocab_tokens = [word for word in sentence if word in word2vec_vect.wv.index_to_key]\n", |
825 | 825 | "\n", |
826 | 826 | "vocab_tokens" |
827 | 827 | ] |
|
870 | 870 | ], |
871 | 871 | "source": [ |
872 | 872 | "# Create document vectors by averaging word vectors\n", |
873 | | - "np.mean(word2vec_model.wv[vocab_tokens], axis=0)" |
| 873 | + "np.mean(word2vec_vect.wv[vocab_tokens], axis=0)" |
874 | 874 | ] |
875 | 875 | }, |
876 | 876 | { |
|
977 | 977 | } |
978 | 978 | ], |
979 | 979 | "source": [ |
980 | | - "df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_model.wv))\n", |
| 980 | + "df['w2v_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, word2vec_vect.wv))\n", |
981 | 981 | "\n", |
982 | 982 | "df.head()" |
983 | 983 | ] |
|
1020 | 1020 | "# # For the sake of this example, we will be loading a pre-trained GloVe model\n", |
1021 | 1021 | "# # You can also choose a Word2Vec or a FastText model\n", |
1022 | 1022 | "\n", |
1023 | | - "# glove_model = api.load('glove-twitter-50')\n", |
| 1023 | + "# glove_vect = api.load('glove-twitter-50')\n", |
1024 | 1024 | "\n", |
1025 | 1025 | "# # Approx. 200 MB in size\n", |
1026 | 1026 | "# # Save Embeddings\n", |
1027 | | - "# glove_model.save('pretrained_models/.50d_glove_vec.kv')" |
| 1027 | + "# glove_vect.save('pretrained_models/.50d_glove_vec.kv')" |
1028 | 1028 | ] |
1029 | 1029 | }, |
1030 | 1030 | { |
|
1045 | 1045 | "# Load Embeddings\n", |
1046 | 1046 | "from gensim.models import KeyedVectors\n", |
1047 | 1047 | "\n", |
1048 | | - "glove_model = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n", |
| 1048 | + "glove_vect = KeyedVectors.load('pretrained_models/.50d_glove_vec.kv')\n", |
1049 | 1049 | "\n", |
1050 | | - "print(type(glove_model))" |
| 1050 | + "print(type(glove_vect))" |
1051 | 1051 | ] |
1052 | 1052 | }, |
1053 | 1053 | { |
|
1068 | 1068 | "source": [ |
1069 | 1069 | "# Vocabulary Size and Word Embedding Shape\n", |
1070 | 1070 | "\n", |
1071 | | - "print(f\"Vocabulary size: {len(glove_model.index_to_key)}\")\n", |
1072 | | - "print(f\"Vector Size: {glove_model.vector_size}\")" |
| 1071 | + "print(f\"Vocabulary size: {len(glove_vect.index_to_key)}\")\n", |
| 1072 | + "print(f\"Vector Size: {glove_vect.vector_size}\")" |
1073 | 1073 | ] |
1074 | 1074 | }, |
1075 | 1075 | { |
|
1098 | 1098 | "source": [ |
1099 | 1099 | "# Getting vector for a word\n", |
1100 | 1100 | "\n", |
1101 | | - "print(f\"Word Embedding Shape: { glove_model['college'].shape }\")\n", |
| 1101 | + "print(f\"Word Embedding Shape: { glove_vect['college'].shape }\")\n", |
1102 | 1102 | "print()\n", |
1103 | | - "print(glove_model['college'])\n", |
| 1103 | + "print(glove_vect['college'])\n", |
1104 | 1104 | "\n", |
1105 | 1105 | "# # We can also use the following:\n", |
1106 | 1106 | "# print(glove_vect.__getitem__('college'))" |
|
1121 | 1121 | } |
1122 | 1122 | ], |
1123 | 1123 | "source": [ |
1124 | | - "tokens = word2vec_model.wv.index_to_key\n", |
| 1124 | + "tokens = word2vec_vect.wv.index_to_key\n", |
1125 | 1125 | "\n", |
1126 | 1126 | "print(tokens)" |
1127 | 1127 | ] |
|
1147 | 1147 | "from sklearn.decomposition import PCA\n", |
1148 | 1148 | "import matplotlib.pyplot as plt\n", |
1149 | 1149 | "\n", |
1150 | | - "X = glove_model[tokens]\n", |
| 1150 | + "X = glove_vect[tokens]\n", |
1151 | 1151 | "pca = PCA(n_components = 2)\n", |
1152 | 1152 | "result = pca.fit_transform(X)\n", |
1153 | 1153 | "\n", |
|
1174 | 1174 | ], |
1175 | 1175 | "source": [ |
1176 | 1176 | "# Find most similar words\n", |
1177 | | - "similar_words = glove_model.most_similar('time', topn=5)\n", |
| 1177 | + "similar_words = glove_vect.most_similar('time', topn=5)\n", |
1178 | 1178 | "\n", |
1179 | 1179 | "print(similar_words)" |
1180 | 1180 | ] |
|
1201 | 1201 | } |
1202 | 1202 | ], |
1203 | 1203 | "source": [ |
1204 | | - "glove_model.most_similar(\"developer\", topn=5)" |
| 1204 | + "glove_vect.most_similar(\"developer\", topn=5)" |
1205 | 1205 | ] |
1206 | 1206 | }, |
1207 | 1207 | { |
|
1224 | 1224 | "source": [ |
1225 | 1225 | "# Computing the similarity between word vectors\n", |
1226 | 1226 | "\n", |
1227 | | - "glove_model.similarity(\"developer\", \"development\")" |
| 1227 | + "glove_vect.similarity(\"developer\", \"development\")" |
1228 | 1228 | ] |
1229 | 1229 | }, |
1230 | 1230 | { |
|
1332 | 1332 | } |
1333 | 1333 | ], |
1334 | 1334 | "source": [ |
1335 | | - "df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda x : get_document_vector(x, glove_model))\n", |
| 1335 | + "df['glove_doc_embeddings'] = df[\"tokenised_sentences\"].apply(lambda doc : get_document_vector(doc, glove_vect))\n", |
1336 | 1336 | "\n", |
1337 | 1337 | "df.head()" |
1338 | 1338 | ] |
|
1409 | 1409 | "source": [ |
1410 | 1410 | "from sentence_transformers import SentenceTransformer\n", |
1411 | 1411 | "\n", |
1412 | | - "bert_model = SentenceTransformer('all-MiniLM-L6-v2')" |
| 1412 | + "bert_vect = SentenceTransformer('all-MiniLM-L6-v2')" |
1413 | 1413 | ] |
1414 | 1414 | }, |
1415 | 1415 | { |
|
1792 | 1792 | " 'Sentences are passed as a list of string.',\n", |
1793 | 1793 | " 'The quick brown fox jumps over the lazy dog.']\n", |
1794 | 1794 | "\n", |
1795 | | - "# Sentences are encoded by calling model.encode()\n", |
1796 | | - "embeddings = model.encode(sentences)\n", |
| 1795 | + "# Sentences are encoded by calling bert_vect.encode()\n", |
| 1796 | + "embeddings = bert_vect.encode(sentences)\n", |
1797 | 1797 | "\n", |
1798 | 1798 | "# Print the embeddings\n", |
1799 | 1799 | "for sentence, embedding in zip(sentences, embeddings):\n", |
|
1841 | 1841 | "source": [ |
1842 | 1842 | "from sentence_transformers import util\n", |
1843 | 1843 | "\n", |
1844 | | - "emb1 = model.encode(\"I am eating Mango\")\n", |
1845 | | - "emb2 = model.encode(\"I like fruits\")\n", |
1846 | | - "emb3 = model.encode(\"I work at Microsoft\")\n", |
| 1844 | + "emb1 = bert_vect.encode(\"I am eating Mango\")\n", |
| 1845 | + "emb2 = bert_vect.encode(\"I like fruits\")\n", |
| 1846 | + "emb3 = bert_vect.encode(\"I work at Microsoft\")\n", |
1847 | 1847 | "cos_sim_12 = util.cos_sim(emb1, emb2)\n", |
1848 | 1848 | "cos_sim_13 = util.cos_sim(emb1, emb3)\n", |
1849 | 1849 | "print(\"Cosine-Similarity between 1 and 2:\", cos_sim_12)\n", |
|
1897 | 1897 | " ]\n", |
1898 | 1898 | "\n", |
1899 | 1899 | "#Encode all sentences\n", |
1900 | | - "embeddings = model.encode(sentences)\n", |
| 1900 | + "embeddings = bert_vect.encode(sentences)\n", |
1901 | 1901 | "\n", |
1902 | 1902 | "#Compute cosine similarity between all pairs\n", |
1903 | 1903 | "cos_sim = util.cos_sim(embeddings, embeddings)\n", |
|
2013 | 2013 | } |
2014 | 2014 | ], |
2015 | 2015 | "source": [ |
2016 | | - "df['sbert_doc_embeddings'] = df['clean_text'].apply(model.encode)\n", |
| 2016 | + "df['sbert_doc_embeddings'] = df['clean_text'].apply(bert_vect.encode)\n", |
2017 | 2017 | "\n", |
2018 | 2018 | "df.head()" |
2019 | 2019 | ] |
|