
Commit 6356570

[catboost/tutorials] Improve text_embedding_features tutorial
Note: mandatory check (NEED_CHECK) was skipped ref:aca587f2a8b976c46e15f6a132bb19dbae80eca6
1 parent 2899fd1 commit 6356570

File tree

1 file changed: +78 -84 lines


events/2020_11_18_catboost_tutorial/text_embedding_features.ipynb

Lines changed: 78 additions & 84 deletions
@@ -526,6 +526,15 @@
     "from catboost.text_processing import Dictionary"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_small_lemmatized_nltk"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -560,6 +569,15 @@
     "!cat dictionary.tsv"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dictionary.apply([text_small_lemmatized_nltk[0]])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -594,35 +612,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 161
-    },
-    "colab_type": "code",
-    "id": "7Ea944JbfFuu",
-    "outputId": "5f788c52-345c-4703-957a-4f57dd29c418"
-   },
-   "outputs": [],
-   "source": [
-    "text_small_lemmatized_nltk"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 35
-    },
-    "colab_type": "code",
-    "id": "bRm5Cf5qkzlJ",
-    "outputId": "6226eea1-ab2b-4924-df6c-a006e71965f5"
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "dictionary.apply([text_small_lemmatized_nltk[0]])"
+    "X_proc_train_small, y_train_small = X_preprocessed_train[:1000]['review'].to_list(), y_train[:1000]\n",
+    "X_proc_train_small = list(map(simple_tokenizer.tokenize, X_proc_train_small))\n",
+    "X_proc_test_small, y_test_small = X_preprocessed_test[:1000]['review'].to_list(), y_test[:1000]\n",
+    "X_proc_test_small = list(map(simple_tokenizer.tokenize, X_proc_test_small))\n",
+    "\n",
+    "dictionary = Dictionary(max_dictionary_size=100)\n",
+    "dictionary.fit(X_proc_train_small);"
    ]
   },
   {
@@ -643,11 +642,13 @@
     "    features = np.zeros((len(tokenized_text), dictionary.size))\n",
     "    for i, tokenized_sentence in enumerate(tokenized_text):\n",
     "        indices = np.array(dictionary.apply([tokenized_sentence])[0])\n",
-    "        features[i, indices] = 1\n",
+    "        if len(indices) > 0:\n",
+    "            features[i, indices] = 1\n",
     "    return features\n",
     "\n",
-    "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n",
-    "bow_features"
+    "X_bow_train_small = bag_of_words(X_proc_train_small, dictionary)\n",
+    "X_bow_test_small = bag_of_words(X_proc_test_small, dictionary)\n",
+    "X_bow_train_small.shape"
    ]
   },
   {
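
The new "if len(indices) > 0" guard fixes a real failure mode: dictionary.apply can return an empty list for a document with no in-dictionary tokens, np.array([]) defaults to dtype float64, and NumPy rejects float arrays as indices. A quick demonstration:

    import numpy as np

    features = np.zeros((2, 5))
    indices = np.array([])  # empty apply() result; dtype is float64

    # features[0, indices] = 1 would raise
    # IndexError: arrays used as indices must be of integer (or boolean) type
    if len(indices) > 0:
        features[0, indices] = 1
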
@@ -663,24 +664,17 @@
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.naive_bayes import MultinomialNB\n",
     "from scipy.sparse import csr_matrix\n",
-    "from sklearn.metrics import log_loss\n",
+    "from sklearn.metrics import roc_auc_score\n",
     "\n",
-    "def fit_linear_model(X, c):\n",
+    "def fit_linear_model(X, y):\n",
     "    model = LogisticRegression()\n",
-    "    model.fit(X, c)\n",
+    "    model.fit(X, y)\n",
     "    return model\n",
     "\n",
-    "def fit_naive_bayes(X, c):\n",
-    "    clf = MultinomialNB()\n",
-    "    if isinstance(X, csr_matrix):\n",
-    "        X.eliminate_zeros()\n",
-    "    clf.fit(X, c)\n",
-    "    return clf\n",
-    "\n",
-    "def evaluate_model_logloss(model, X, y):\n",
+    "def evaluate_model_auc(model, X, y):\n",
     "    y_pred = model.predict_proba(X)[:,1]\n",
-    "    metric = log_loss(y, y_pred)\n",
-    "    print('Logloss: ' + str(metric))"
+    "    metric = roc_auc_score(y, y_pred)\n",
+    "    print('AUC: ' + str(metric))"
    ]
   },
   {
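
Switching the metric from logloss to ROC AUC also changes the natural baseline: a constant prediction scores AUC 0.5 regardless of the constant, while its logloss depends on the predicted value. A small check:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_test = np.array([0, 1, 1, 0, 1])

    # A constant score ranks all examples equally, so AUC is exactly 0.5.
    print(roc_auc_score(y_test, np.ones(len(y_test)) * 0.5))
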
@@ -697,19 +691,16 @@
    },
    "outputs": [],
    "source": [
-    "def evaluate_models(X, y):\n",
-    "    linear_model = fit_linear_model(bow_features, target_small)\n",
-    "    naive_bayes = fit_naive_bayes(bow_features, target_small)\n",
+    "def evaluate_models(X_train, y_train, X_test, y_test):\n",
+    "    linear_model = fit_linear_model(X_train, y_train)\n",
     "    \n",
     "    print('Linear model')\n",
-    "    evaluate_model_logloss(linear_model, X, y)\n",
-    "    print('Naive bayes')\n",
-    "    evaluate_model_logloss(naive_bayes, X, y)\n",
+    "    evaluate_model_auc(linear_model, X_test, y_test)\n",
     "    print('Comparing to constant prediction')\n",
-    "    logloss_constant_prediction = log_loss(y, np.ones(shape=(len(text_small), 2)) * 0.5)\n",
-    "    print('Logloss: ' + str(logloss_constant_prediction))\n",
+    "    auc_constant_prediction = roc_auc_score(y_test, np.ones(shape=(len(y_test), 1)) * 0.5)\n",
+    "    print('AUC: ' + str(auc_constant_prediction))\n",
     "    \n",
-    "evaluate_models(bow_features, target_small)"
+    "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)"
    ]
   },
   {
@@ -726,11 +717,14 @@
    },
    "outputs": [],
    "source": [
-    "dictionary = Dictionary(occurence_lower_bound=0)\n",
-    "dictionary.fit(text_small_lemmatized_nltk)\n",
+    "unigram_dictionary = Dictionary(occurence_lower_bound=0, max_dictionary_size=1000)\n",
+    "unigram_dictionary.fit(X_proc_train_small)\n",
     "\n",
-    "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n",
-    "evaluate_models(bow_features, target_small)"
+    "X_bow_train_small = bag_of_words(X_proc_train_small, unigram_dictionary)\n",
+    "X_bow_test_small = bag_of_words(X_proc_test_small, unigram_dictionary)\n",
+    "print(X_bow_train_small.shape)\n",
+    "\n",
+    "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)"
    ]
   },
   {
@@ -800,8 +794,14 @@
    },
    "outputs": [],
    "source": [
-    "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n",
-    "evaluate_models(bow_features, target_small)"
+    "bigram_dictionary = Dictionary(occurence_lower_bound=0, max_dictionary_size=5000, gram_order=2)\n",
+    "bigram_dictionary.fit(X_proc_train_small)\n",
+    "\n",
+    "X_bow_train_small = bag_of_words(X_proc_train_small, bigram_dictionary)\n",
+    "X_bow_test_small = bag_of_words(X_proc_test_small, bigram_dictionary)\n",
+    "print(X_bow_train_small.shape)\n",
+    "\n",
+    "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)"
    ]
   },
   {
@@ -828,18 +828,17 @@
    },
    "outputs": [],
    "source": [
-    "dictionary1 = Dictionary(occurence_lower_bound=0)\n",
-    "dictionary1.fit(text_small_lemmatized_nltk)\n",
+    "X_bow_train_small = np.concatenate((\n",
+    "    bag_of_words(X_proc_train_small, unigram_dictionary),\n",
+    "    bag_of_words(X_proc_train_small, bigram_dictionary)\n",
+    "), axis=1)\n",
+    "X_bow_test_small = np.concatenate((\n",
+    "    bag_of_words(X_proc_test_small, unigram_dictionary),\n",
+    "    bag_of_words(X_proc_test_small, bigram_dictionary)\n",
+    "), axis=1)\n",
+    "print(X_bow_train_small.shape)\n",
     "\n",
-    "bow_features1 = bag_of_words(text_small_lemmatized_nltk, dictionary1)\n",
-    "\n",
-    "dictionary2 = Dictionary(occurence_lower_bound=0, gram_order=2)\n",
-    "dictionary2.fit(text_small_lemmatized_nltk)\n",
-    "\n",
-    "bow_features2 = bag_of_words(text_small_lemmatized_nltk, dictionary2)\n",
-    "\n",
-    "bow_features = np.concatenate((bow_features1, bow_features2), axis=1)\n",
-    "evaluate_models(bow_features, target_small)"
+    "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)"
    ]
   },
   {
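
The cell above reuses the already-fitted unigram_dictionary and bigram_dictionary rather than refitting new ones, and stacks their features column-wise. A toy end-to-end version of the same idea (bag_of_words copied from the updated cell earlier in the diff):

    import numpy as np
    from catboost.text_processing import Dictionary

    def bag_of_words(tokenized_text, dictionary):
        # Binary bag-of-words: one column per dictionary token.
        features = np.zeros((len(tokenized_text), dictionary.size))
        for i, tokenized_sentence in enumerate(tokenized_text):
            indices = np.array(dictionary.apply([tokenized_sentence])[0])
            if len(indices) > 0:
                features[i, indices] = 1
        return features

    toy_corpus = [['a', 'great', 'movie'], ['an', 'awful', 'movie']]

    unigram_dictionary = Dictionary(occurence_lower_bound=0)
    unigram_dictionary.fit(toy_corpus)
    bigram_dictionary = Dictionary(occurence_lower_bound=0, gram_order=2)
    bigram_dictionary.fit(toy_corpus)

    # Concatenate unigram and bigram features column-wise.
    X_bow = np.concatenate((
        bag_of_words(toy_corpus, unigram_dictionary),
        bag_of_words(toy_corpus, bigram_dictionary),
    ), axis=1)
    print(X_bow.shape)  # (2, unigram_dictionary.size + bigram_dictionary.size)
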
@@ -1066,8 +1065,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X_embed_train_small, y_train_small = X_embed_train[:1000], y_train[:1000]\n",
-    "X_embed_test_small, y_test_small = X_embed_test[:1000], y_test[:1000]"
+    "X_embed_first_train_small, y_first_train_small = X_embed_train[:5000], y_train[:5000]\n",
+    "X_embed_second_train_small, y_second_train_small = X_embed_train[5000:10000], y_train[5000:10000]\n",
+    "X_embed_test_small, y_test_small = X_embed_test[:5000], y_test[:5000]"
    ]
   },
   {
@@ -1083,8 +1083,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "linmodel = fit_linear_model(X_embed_train_small, y_train_small)\n",
-    "evaluate_model_logloss(linmodel, X_embed_test_small, y_test_small)"
+    "evaluate_models(X_embed_second_train_small, y_second_train_small, X_embed_test_small, y_test_small)"
    ]
   },
   {
@@ -1102,18 +1101,13 @@
    "source": [
     "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
     "\n",
-    "clf = LinearDiscriminantAnalysis()\n",
-    "clf.fit(X_embed_train_small[:500], y_train_small[:500])\n",
-    "\n",
-    "X_lda_train_small = clf.transform(X_embed_train_small[500:])\n",
-    "X_embed_lda_train_small = np.concatenate([X_embed_train_small[500:], X_lda_train_small], axis=1)\n",
-    "\n",
-    "X_lda_test_small = clf.transform(X_embed_test_small)\n",
-    "X_embed_lda_test_small = np.concatenate([X_embed_test_small, X_lda_test_small], axis=1)\n",
-    "\n",
+    "lda = LinearDiscriminantAnalysis(solver='svd')\n",
+    "lda.fit(X_embed_first_train_small, y_first_train_small)\n",
     "\n",
-    "linmodel = fit_linear_model(X_embed_lda_train_small, y_train_small[500:])\n",
-    "evaluate_model_logloss(linmodel, X_embed_lda_test_small, y_test_small)"
+    "X_lda_train_small = lda.transform(X_embed_second_train_small)\n",
+    "X_lda_test_small = lda.transform(X_embed_test_small)\n",
+    "print(X_lda_train_small.shape)\n",
+    "evaluate_models(X_lda_train_small, y_second_train_small, X_lda_test_small, y_test_small)"
    ]
   },
   {
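
The reworked cell fits LinearDiscriminantAnalysis on the first 5000-example slice and only transforms the disjoint second slice and the test set, avoiding the leakage that comes from fitting a supervised projection on the same examples the downstream model is trained and evaluated on. For a binary target the transform yields a single discriminant component (n_classes - 1). A self-contained sketch on synthetic data:

    import numpy as np
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    rng = np.random.RandomState(0)
    X_first, y_first = rng.randn(100, 10), rng.randint(0, 2, 100)  # LDA fit slice
    X_second = rng.randn(100, 10)                                  # disjoint slice

    lda = LinearDiscriminantAnalysis(solver='svd')
    lda.fit(X_first, y_first)

    # Binary target => projection onto n_classes - 1 = 1 component.
    print(lda.transform(X_second).shape)  # (100, 1)
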
