|
526 | 526 | "from catboost.text_processing import Dictionary" |
527 | 527 | ] |
528 | 528 | }, |
| 529 | + { |
| 530 | + "cell_type": "code", |
| 531 | + "execution_count": null, |
| 532 | + "metadata": {}, |
| 533 | + "outputs": [], |
| 534 | + "source": [ |
| 535 | + "text_small_lemmatized_nltk" |
| 536 | + ] |
| 537 | + }, |
529 | 538 | { |
530 | 539 | "cell_type": "code", |
531 | 540 | "execution_count": null, |
|
560 | 569 | "!cat dictionary.tsv" |
561 | 570 | ] |
562 | 571 | }, |
| 572 | + { |
| 573 | + "cell_type": "code", |
| 574 | + "execution_count": null, |
| 575 | + "metadata": {}, |
| 576 | + "outputs": [], |
| 577 | + "source": [ |
| 578 | + "dictionary.apply([text_small_lemmatized_nltk[0]])" |
| 579 | + ] |
| 580 | + }, |
563 | 581 | { |
564 | 582 | "cell_type": "markdown", |
565 | 583 | "metadata": { |
|
594 | 612 | { |
595 | 613 | "cell_type": "code", |
596 | 614 | "execution_count": null, |
597 | | - "metadata": { |
598 | | - "colab": { |
599 | | - "base_uri": "https://localhost:8080/", |
600 | | - "height": 161 |
601 | | - }, |
602 | | - "colab_type": "code", |
603 | | - "id": "7Ea944JbfFuu", |
604 | | - "outputId": "5f788c52-345c-4703-957a-4f57dd29c418" |
605 | | - }, |
606 | | - "outputs": [], |
607 | | - "source": [ |
608 | | - "text_small_lemmatized_nltk" |
609 | | - ] |
610 | | - }, |
611 | | - { |
612 | | - "cell_type": "code", |
613 | | - "execution_count": null, |
614 | | - "metadata": { |
615 | | - "colab": { |
616 | | - "base_uri": "https://localhost:8080/", |
617 | | - "height": 35 |
618 | | - }, |
619 | | - "colab_type": "code", |
620 | | - "id": "bRm5Cf5qkzlJ", |
621 | | - "outputId": "6226eea1-ab2b-4924-df6c-a006e71965f5" |
622 | | - }, |
| 615 | + "metadata": {}, |
623 | 616 | "outputs": [], |
624 | 617 | "source": [ |
625 | | - "dictionary.apply([text_small_lemmatized_nltk[0]])" |
| 618 | + "X_proc_train_small, y_train_small = X_preprocessed_train[:1000]['review'].to_list(), y_train[:1000]\n", |
| 619 | + "X_proc_train_small = list(map(simple_tokenizer.tokenize, X_proc_train_small))\n", |
| 620 | + "X_proc_test_small, y_test_small = X_preprocessed_test[:1000]['review'].to_list(), y_test[:1000]\n", |
| 621 | + "X_proc_test_small = list(map(simple_tokenizer.tokenize, X_proc_test_small))\n", |
| 622 | + "\n", |
| 623 | + "dictionary = Dictionary(max_dictionary_size=100)\n", |
| 624 | + "dictionary.fit(X_proc_train_small);" |
626 | 625 | ] |
627 | 626 | }, |
628 | 627 | { |
|
643 | 642 | " features = np.zeros((len(tokenized_text), dictionary.size))\n", |
644 | 643 | " for i, tokenized_sentence in enumerate(tokenized_text):\n", |
645 | 644 | " indices = np.array(dictionary.apply([tokenized_sentence])[0])\n", |
646 | | - " features[i, indices] = 1\n", |
| 645 | + " if len(indices) > 0:\n", |
| 646 | + " features[i, indices] = 1\n", |
647 | 647 | " return features\n", |
648 | 648 | "\n", |
649 | | - "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", |
650 | | - "bow_features" |
| 649 | + "X_bow_train_small = bag_of_words(X_proc_train_small, dictionary)\n", |
| 650 | + "X_bow_test_small = bag_of_words(X_proc_test_small, dictionary)\n", |
| 651 | + "X_bow_train_small.shape" |
651 | 652 | ] |
652 | 653 | }, |
653 | 654 | { |
|
663 | 664 | "from sklearn.linear_model import LogisticRegression\n", |
664 | 665 | "from sklearn.naive_bayes import MultinomialNB\n", |
665 | 666 | "from scipy.sparse import csr_matrix\n", |
666 | | - "from sklearn.metrics import log_loss\n", |
| 667 | + "from sklearn.metrics import roc_auc_score\n", |
667 | 668 | "\n", |
668 | | - "def fit_linear_model(X, c):\n", |
| 669 | + "def fit_linear_model(X, y):\n", |
669 | 670 | " model = LogisticRegression()\n", |
670 | | - " model.fit(X, c)\n", |
| 671 | + " model.fit(X, y)\n", |
671 | 672 | " return model\n", |
672 | 673 | "\n", |
673 | | - "def fit_naive_bayes(X, c):\n", |
674 | | - " clf = MultinomialNB()\n", |
675 | | - " if isinstance(X, csr_matrix):\n", |
676 | | - " X.eliminate_zeros()\n", |
677 | | - " clf.fit(X, c)\n", |
678 | | - " return clf\n", |
679 | | - "\n", |
680 | | - "def evaluate_model_logloss(model, X, y):\n", |
| 674 | + "def evaluate_model_auc(model, X, y):\n", |
681 | 675 | " y_pred = model.predict_proba(X)[:,1]\n", |
682 | | - " metric = log_loss(y, y_pred)\n", |
683 | | - " print('Logloss: ' + str(metric))" |
| 676 | + " metric = roc_auc_score(y, y_pred)\n", |
| 677 | + " print('AUC: ' + str(metric))" |
684 | 678 | ] |
685 | 679 | }, |
686 | 680 | { |
|
697 | 691 | }, |
698 | 692 | "outputs": [], |
699 | 693 | "source": [ |
700 | | - "def evaluate_models(X, y):\n", |
701 | | - " linear_model = fit_linear_model(bow_features, target_small)\n", |
702 | | - " naive_bayes = fit_naive_bayes(bow_features, target_small)\n", |
| 694 | + "def evaluate_models(X_train, y_train, X_test, y_test):\n", |
| 695 | + " linear_model = fit_linear_model(X_train, y_train)\n", |
703 | 696 | " \n", |
704 | 697 | " print('Linear model')\n", |
705 | | - " evaluate_model_logloss(linear_model, X, y)\n", |
706 | | - " print('Naive bayes')\n", |
707 | | - " evaluate_model_logloss(naive_bayes, X, y)\n", |
| 698 | + " evaluate_model_auc(linear_model, X_test, y_test)\n", |
708 | 699 | " print('Comparing to constant prediction')\n", |
709 | | - " logloss_constant_prediction = log_loss(y, np.ones(shape=(len(text_small), 2)) * 0.5)\n", |
710 | | - " print('Logloss: ' + str(logloss_constant_prediction))\n", |
| 700 | + " auc_constant_prediction = roc_auc_score(y_test, np.ones(shape=(len(y_test), 1)) * 0.5)\n", |
| 701 | + " print('AUC: ' + str(auc_constant_prediction))\n", |
711 | 702 | " \n", |
712 | | - "evaluate_models(bow_features, target_small)" |
| 703 | + "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)" |
713 | 704 | ] |
714 | 705 | }, |
715 | 706 | { |
|
726 | 717 | }, |
727 | 718 | "outputs": [], |
728 | 719 | "source": [ |
729 | | - "dictionary = Dictionary(occurence_lower_bound=0)\n", |
730 | | - "dictionary.fit(text_small_lemmatized_nltk)\n", |
| 720 | + "unigram_dictionary = Dictionary(occurence_lower_bound=0, max_dictionary_size=1000)\n", |
| 721 | + "unigram_dictionary.fit(X_proc_train_small)\n", |
731 | 722 | "\n", |
732 | | - "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", |
733 | | - "evaluate_models(bow_features, target_small)" |
| 723 | + "X_bow_train_small = bag_of_words(X_proc_train_small, unigram_dictionary)\n", |
| 724 | + "X_bow_test_small = bag_of_words(X_proc_test_small, unigram_dictionary)\n", |
| 725 | + "print(X_bow_train_small.shape)\n", |
| 726 | + "\n", |
| 727 | + "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)" |
734 | 728 | ] |
735 | 729 | }, |
736 | 730 | { |
|
800 | 794 | }, |
801 | 795 | "outputs": [], |
802 | 796 | "source": [ |
803 | | - "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", |
804 | | - "evaluate_models(bow_features, target_small)" |
| 797 | + "bigram_dictionary = Dictionary(occurence_lower_bound=0, max_dictionary_size=5000, gram_order=2)\n", |
| 798 | + "bigram_dictionary.fit(X_proc_train_small)\n", |
| 799 | + "\n", |
| 800 | + "X_bow_train_small = bag_of_words(X_proc_train_small, bigram_dictionary)\n", |
| 801 | + "X_bow_test_small = bag_of_words(X_proc_test_small, bigram_dictionary)\n", |
| 802 | + "print(X_bow_train_small.shape)\n", |
| 803 | + "\n", |
| 804 | + "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)" |
805 | 805 | ] |
806 | 806 | }, |
807 | 807 | { |
|
828 | 828 | }, |
829 | 829 | "outputs": [], |
830 | 830 | "source": [ |
831 | | - "dictionary1 = Dictionary(occurence_lower_bound=0)\n", |
832 | | - "dictionary1.fit(text_small_lemmatized_nltk)\n", |
| 831 | + "X_bow_train_small = np.concatenate((\n", |
| 832 | + " bag_of_words(X_proc_train_small, unigram_dictionary),\n", |
| 833 | + " bag_of_words(X_proc_train_small, bigram_dictionary)\n", |
| 834 | + "), axis=1)\n", |
| 835 | + "X_bow_test_small = np.concatenate((\n", |
| 836 | + " bag_of_words(X_proc_test_small, unigram_dictionary),\n", |
| 837 | + " bag_of_words(X_proc_test_small, bigram_dictionary)\n", |
| 838 | + "), axis=1)\n", |
| 839 | + "print(X_bow_train_small.shape)\n", |
833 | 840 | "\n", |
834 | | - "bow_features1 = bag_of_words(text_small_lemmatized_nltk, dictionary1)\n", |
835 | | - "\n", |
836 | | - "dictionary2 = Dictionary(occurence_lower_bound=0, gram_order=2)\n", |
837 | | - "dictionary2.fit(text_small_lemmatized_nltk)\n", |
838 | | - "\n", |
839 | | - "bow_features2 = bag_of_words(text_small_lemmatized_nltk, dictionary2)\n", |
840 | | - "\n", |
841 | | - "bow_features = np.concatenate((bow_features1, bow_features2), axis=1)\n", |
842 | | - "evaluate_models(bow_features, target_small)" |
| 841 | + "evaluate_models(X_bow_train_small, y_train_small, X_bow_test_small, y_test_small)" |
843 | 842 | ] |
844 | 843 | }, |
845 | 844 | { |
|
1066 | 1065 | "metadata": {}, |
1067 | 1066 | "outputs": [], |
1068 | 1067 | "source": [ |
1069 | | - "X_embed_train_small, y_train_small = X_embed_train[:1000], y_train[:1000]\n", |
1070 | | - "X_embed_test_small, y_test_small = X_embed_test[:1000], y_test[:1000]" |
| 1068 | + "X_embed_first_train_small, y_first_train_small = X_embed_train[:5000], y_train[:5000]\n", |
| 1069 | + "X_embed_second_train_small, y_second_train_small = X_embed_train[5000:10000], y_train[5000:10000]\n", |
| 1070 | + "X_embed_test_small, y_test_small = X_embed_test[:5000], y_test[:5000]" |
1071 | 1071 | ] |
1072 | 1072 | }, |
1073 | 1073 | { |
|
1083 | 1083 | "metadata": {}, |
1084 | 1084 | "outputs": [], |
1085 | 1085 | "source": [ |
1086 | | - "linmodel = fit_linear_model(X_embed_train_small, y_train_small)\n", |
1087 | | - "evaluate_model_logloss(linmodel, X_embed_test_small, y_test_small)" |
| 1086 | + "evaluate_models(X_embed_second_train_small, y_second_train_small, X_embed_test_small, y_test_small)" |
1088 | 1087 | ] |
1089 | 1088 | }, |
1090 | 1089 | { |
|
1102 | 1101 | "source": [ |
1103 | 1102 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", |
1104 | 1103 | "\n", |
1105 | | - "clf = LinearDiscriminantAnalysis()\n", |
1106 | | - "clf.fit(X_embed_train_small[:500], y_train_small[:500])\n", |
1107 | | - "\n", |
1108 | | - "X_lda_train_small = clf.transform(X_embed_train_small[500:])\n", |
1109 | | - "X_embed_lda_train_small = np.concatenate([X_embed_train_small[500:], X_lda_train_small], axis=1)\n", |
1110 | | - "\n", |
1111 | | - "X_lda_test_small = clf.transform(X_embed_test_small)\n", |
1112 | | - "X_embed_lda_test_small = np.concatenate([X_embed_test_small, X_lda_test_small], axis=1)\n", |
1113 | | - "\n", |
| 1104 | + "lda = LinearDiscriminantAnalysis(solver='svd')\n", |
| 1105 | + "lda.fit(X_embed_first_train_small, y_first_train_small)\n", |
1114 | 1106 | "\n", |
1115 | | - "linmodel = fit_linear_model(X_embed_lda_train_small, y_train_small[500:])\n", |
1116 | | - "evaluate_model_logloss(linmodel, X_embed_lda_test_small, y_test_small)" |
| 1107 | + "X_lda_train_small = lda.transform(X_embed_second_train_small)\n", |
| 1108 | + "X_lda_test_small = lda.transform(X_embed_test_small)\n", |
| 1109 | + "print(X_lda_train_small.shape)\n", |
| 1110 | + "evaluate_models(X_lda_train_small, y_second_train_small, X_lda_test_small, y_test_small)" |
1117 | 1111 | ] |
1118 | 1112 | }, |
1119 | 1113 | { |
|