imsanjoykb
diff --git a/‎Explotrary Data Analysis-EDA/Restaurant Customer Review/Restaurant.ipynb‎
Lines changed: 359 additions & 0 deletions b/‎Explotrary Data Analysis-EDA/Restaurant Customer Review/Restaurant.ipynb‎
Lines changed: 359 additions & 0 deletions
@@ -0,0 +1,359 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "peaceful-legend",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nltk\n",
+ "import pandas as pd\n",
+ "from nltk.stem import WordNetLemmatizer \n",
+ "lemmatizer = WordNetLemmatizer()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "featured-strategy",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>Review</th>\n",
+ " <th>Liked</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Wow... Loved this place.</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Crust is not good.</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Not tasty and the texture was just nasty.</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Stopped by during the late May bank holiday of...</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>The selection on the menu was great and so wer...</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " Review Liked\n",
+ "0 Wow... Loved this place. 1\n",
+ "1 Crust is not good. 0\n",
+ "2 Not tasty and the texture was just nasty. 0\n",
+ "3 Stopped by during the late May bank holiday of... 1\n",
+ "4 The selection on the menu was great and so wer... 1"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the tab-separated review dataset and preview its first rows\n",
+ "df = pd.read_csv(\n",
+ "    \"Restaurant_Reviews.tsv\",\n",
+ "    sep='\\t',\n",
+ ")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "colonial-cassette",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Wow... Loved this place.'"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Peek at the raw text of the first review (select the column, then the first position)\n",
+ "df['Review'].iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "owned-adelaide",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from nltk.stem import PorterStemmer\n",
+ "from nltk.corpus import stopwords\n",
+ "\n",
+ "ps = PorterStemmer()\n",
+ "# Build the stop-word set once instead of rebuilding it for every single word.\n",
+ "english_stopwords = set(stopwords.words('english'))\n",
+ "\n",
+ "\n",
+ "def clean_review(text):\n",
+ "    \"\"\"Return `text` with non-letters replaced by spaces, lower-cased,\n",
+ "    stop words removed and the remaining words Porter-stemmed.\"\"\"\n",
+ "    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()\n",
+ "    return ' '.join(ps.stem(word) for word in words if word not in english_stopwords)\n",
+ "\n",
+ "\n",
+ "# Iterate over the column values directly rather than label-indexing with\n",
+ "# df['Review'][i], which only works while the index is the default 0..n-1.\n",
+ "corpus = [clean_review(review) for review in df['Review']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "ambient-backing",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'wow love place'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sanity check: show the first cleaned review produced by the preprocessing step\n",
+ "corpus[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "brave-start",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Review 0\n",
+ "Liked 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count missing values per column\n",
+ "df.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "cloudy-enterprise",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Features are the raw review strings; the target is the 0/1 Liked column.\n",
+ "# NOTE(review): the stemmed `corpus` built earlier is not used from here on.\n",
+ "X, y = df['Review'], df['Liked']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "intellectual-latino",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 50.0\n",
+ "1 50.0\n",
+ "Name: Liked, dtype: float64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Share of each class in the target, expressed as a percentage\n",
+ "y.value_counts(normalize=True) * 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "returning-cartridge",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Hold out 30% of the reviews for evaluation; the fixed seed keeps the split reproducible\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X,\n",
+ "    y,\n",
+ "    test_size=0.30,\n",
+ "    random_state=0,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "curious-modem",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "# Fit the TF-IDF vectorizer on the training reviews only, then reuse the\n",
+ "# fitted vectorizer to encode the held-out reviews (evaluated left to right).\n",
+ "vectorizer = TfidfVectorizer()\n",
+ "X_train_vect, X_test_vect = (\n",
+ "    vectorizer.fit_transform(X_train),\n",
+ "    vectorizer.transform(X_test),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "ordered-consumption",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LogisticRegression()"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# fit() returns the estimator itself, so training is chained onto construction;\n",
+ "# the bare name on the last line displays the fitted model.\n",
+ "clf = LogisticRegression(solver='lbfgs').fit(X_train_vect, y_train)\n",
+ "clf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "prepared-acrobat",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.75 0.85 0.80 143\n",
+ " 1 0.84 0.75 0.79 157\n",
+ "\n",
+ " accuracy 0.79 300\n",
+ " macro avg 0.80 0.80 0.79 300\n",
+ "weighted avg 0.80 0.79 0.79 300\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "\n",
+ "# Predict on the held-out reviews and report per-class precision/recall/F1.\n",
+ "# The report already contains the overall accuracy, so no separate\n",
+ "# accuracy_score call is made (its result was previously computed and discarded).\n",
+ "y_pred = clf.predict(X_test_vect)\n",
+ "print(classification_report(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "universal-moore",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "This is Positive Review\n"
+ ]
+ }
+ ],
+ "source": [
+ "test = \"this resturnt is good\"\n",
+ "\n",
+ "# Pass the raw sentence straight to the fitted vectorizer: it was fitted on the raw\n",
+ "# review text in df['Review'], so extra regex cleaning / lemmatization at prediction\n",
+ "# time would prepare the input differently from the training data.\n",
+ "example_counts = vectorizer.transform([test])\n",
+ "prediction = clf.predict(example_counts)\n",
+ "\n",
+ "# The model predicts the Liked label: 1 is reported as positive, 0 as negative.\n",
+ "if prediction[0] == 1:\n",
+ "    print(\"This is Positive Review\")\n",
+ "else:\n",
+ "    print(\"This is Negative Review\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "colored-investment",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}