Skip to content

Commit 19c5d70

Browse files
committed
Update intro_data_modelling.ipynb
1 parent 319fa93 commit 19c5d70

File tree

1 file changed

+121
-22
lines changed
  • Module 4 - Machine Learning/01. Data Preparation and Modelling with sklearn/1. Hands-on with sklearn/code

1 file changed

+121
-22
lines changed

Module 4 - Machine Learning/01. Data Preparation and Modelling with sklearn/1. Hands-on with sklearn/code/intro_data_modelling.ipynb

Lines changed: 121 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -413,7 +413,7 @@
413413
{
414414
"data": {
415415
"text/plain": [
416-
"<seaborn.axisgrid.PairGrid at 0x254702cd250>"
416+
"<seaborn.axisgrid.PairGrid at 0x17f4039bca0>"
417417
]
418418
},
419419
"execution_count": 10,
@@ -686,7 +686,7 @@
686686
}
687687
],
688688
"source": [
689-
"# Data-preprocessing: Standardizing the data\n",
689+
"# Data-preprocessing of X_train: Standardizing the data\n",
690690
"\n",
691691
"from sklearn.preprocessing import StandardScaler\n",
692692
"scaler = StandardScaler()\n",
@@ -717,6 +717,86 @@
717717
"print(\"Std of each column:\", np.sqrt(scaler.var_))"
718718
]
719719
},
720+
{
721+
"cell_type": "code",
722+
"execution_count": 17,
723+
"metadata": {},
724+
"outputs": [],
725+
"source": [
726+
"# Data-preprocessing of y_train: Encoding the data\n",
727+
"\n",
728+
"from sklearn.preprocessing import LabelEncoder\n",
729+
"\n",
730+
"# Apply Label Encoding to convert target classes into numerical labels\n",
731+
"label_encoder = LabelEncoder()\n",
732+
"\n",
733+
"y_train_transformed = label_encoder.fit_transform(y_train)"
734+
]
735+
},
736+
{
737+
"cell_type": "code",
738+
"execution_count": 18,
739+
"metadata": {},
740+
"outputs": [
741+
{
742+
"data": {
743+
"text/plain": [
744+
"array([1, 1, 2, 0, 2])"
745+
]
746+
},
747+
"execution_count": 18,
748+
"metadata": {},
749+
"output_type": "execute_result"
750+
}
751+
],
752+
"source": [
753+
"y_train_transformed[:5]"
754+
]
755+
},
756+
{
757+
"cell_type": "code",
758+
"execution_count": 19,
759+
"metadata": {},
760+
"outputs": [
761+
{
762+
"name": "stdout",
763+
"output_type": "stream",
764+
"text": [
765+
"['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']\n"
766+
]
767+
}
768+
],
769+
"source": [
770+
"# Let's check the mapping\n",
771+
"\n",
772+
"print(label_encoder.classes_)"
773+
]
774+
},
775+
{
776+
"cell_type": "code",
777+
"execution_count": 20,
778+
"metadata": {},
779+
"outputs": [
780+
{
781+
"data": {
782+
"text/plain": [
783+
"61 Iris-versicolor\n",
784+
"92 Iris-versicolor\n",
785+
"112 Iris-virginica\n",
786+
"2 Iris-setosa\n",
787+
"141 Iris-virginica\n",
788+
"Name: Species, dtype: object"
789+
]
790+
},
791+
"execution_count": 20,
792+
"metadata": {},
793+
"output_type": "execute_result"
794+
}
795+
],
796+
"source": [
797+
"y_train.head()"
798+
]
799+
},
720800
{
721801
"cell_type": "markdown",
722802
"metadata": {},
@@ -726,7 +806,7 @@
726806
},
727807
{
728808
"cell_type": "code",
729-
"execution_count": 17,
809+
"execution_count": 21,
730810
"metadata": {},
731811
"outputs": [
732812
{
@@ -738,7 +818,7 @@
738818
"LogisticRegression()"
739819
]
740820
},
741-
"execution_count": 17,
821+
"execution_count": 21,
742822
"metadata": {},
743823
"output_type": "execute_result"
744824
}
@@ -751,7 +831,7 @@
751831
"classifier = LogisticRegression()\n",
752832
"\n",
753833
"# Train a model on training data\n",
754-
"classifier.fit(X_train_transformed, y_train)"
834+
"classifier.fit(X_train_transformed, y_train_transformed)"
755835
]
756836
},
757837
{
@@ -763,7 +843,7 @@
763843
},
764844
{
765845
"cell_type": "code",
766-
"execution_count": 18,
846+
"execution_count": 22,
767847
"metadata": {},
768848
"outputs": [
769849
{
@@ -781,6 +861,25 @@
781861
"print(X_test_transformed.shape)"
782862
]
783863
},
864+
{
865+
"cell_type": "code",
866+
"execution_count": 23,
867+
"metadata": {},
868+
"outputs": [
869+
{
870+
"name": "stdout",
871+
"output_type": "stream",
872+
"text": [
873+
"(38,)\n"
874+
]
875+
}
876+
],
877+
"source": [
878+
"y_test_transformed = label_encoder.transform(y_test)\n",
879+
"\n",
880+
"print(y_test_transformed.shape)"
881+
]
882+
},
784883
{
785884
"cell_type": "markdown",
786885
"metadata": {},
@@ -790,7 +889,7 @@
790889
},
791890
{
792891
"cell_type": "code",
793-
"execution_count": 19,
892+
"execution_count": 24,
794893
"metadata": {},
795894
"outputs": [],
796895
"source": [
@@ -807,7 +906,7 @@
807906
},
808907
{
809908
"cell_type": "code",
810-
"execution_count": 20,
909+
"execution_count": 25,
811910
"metadata": {},
812911
"outputs": [
813912
{
@@ -816,7 +915,7 @@
816915
"0.9736842105263158"
817916
]
818917
},
819-
"execution_count": 20,
918+
"execution_count": 25,
820919
"metadata": {},
821920
"output_type": "execute_result"
822921
}
@@ -826,7 +925,7 @@
826925
"from sklearn import metrics\n",
827926
"\n",
828927
"# Calculate accuracy score\n",
829-
"metrics.accuracy_score(y_test, y_test_pred)"
928+
"metrics.accuracy_score(y_test_transformed, y_test_pred)"
830929
]
831930
},
832931
{
@@ -838,7 +937,7 @@
838937
},
839938
{
840939
"cell_type": "code",
841-
"execution_count": 21,
940+
"execution_count": 26,
842941
"metadata": {},
843942
"outputs": [
844943
{
@@ -847,7 +946,7 @@
847946
"0.9736842105263158"
848947
]
849948
},
850-
"execution_count": 21,
949+
"execution_count": 26,
851950
"metadata": {},
852951
"output_type": "execute_result"
853952
}
@@ -860,13 +959,13 @@
860959
"classifier = KNeighborsClassifier()\n",
861960
"\n",
862961
"# Training the model\n",
863-
"classifier.fit(X_train_transformed, y_train)\n",
962+
"classifier.fit(X_train_transformed, y_train_transformed)\n",
864963
"\n",
865964
"# Prediction on unseen data\n",
866965
"y_test_pred = classifier.predict(X_test_transformed)\n",
867966
"\n",
868967
"# Evaluation\n",
869-
"metrics.accuracy_score(y_test, y_test_pred)"
968+
"metrics.accuracy_score(y_test_transformed, y_test_pred)"
870969
]
871970
},
872971
{
@@ -878,7 +977,7 @@
878977
},
879978
{
880979
"cell_type": "code",
881-
"execution_count": 22,
980+
"execution_count": 27,
882981
"metadata": {},
883982
"outputs": [
884983
{
@@ -887,7 +986,7 @@
887986
"0.9736842105263158"
888987
]
889988
},
890-
"execution_count": 22,
989+
"execution_count": 27,
891990
"metadata": {},
892991
"output_type": "execute_result"
893992
}
@@ -900,13 +999,13 @@
900999
"classifier = DecisionTreeClassifier()\n",
9011000
"\n",
9021001
"# Training the model\n",
903-
"classifier.fit(X_train_transformed, y_train)\n",
1002+
"classifier.fit(X_train_transformed, y_train_transformed)\n",
9041003
"\n",
9051004
"# Prediction on unseen data\n",
9061005
"y_test_pred = classifier.predict(X_test_transformed)\n",
9071006
"\n",
9081007
"# Evaluation\n",
909-
"metrics.accuracy_score(y_test, y_test_pred)"
1008+
"metrics.accuracy_score(y_test_transformed, y_test_pred)"
9101009
]
9111010
},
9121011
{
@@ -918,7 +1017,7 @@
9181017
},
9191018
{
9201019
"cell_type": "code",
921-
"execution_count": 23,
1020+
"execution_count": 28,
9221021
"metadata": {},
9231022
"outputs": [
9241023
{
@@ -927,7 +1026,7 @@
9271026
"0.9736842105263158"
9281027
]
9291028
},
930-
"execution_count": 23,
1029+
"execution_count": 28,
9311030
"metadata": {},
9321031
"output_type": "execute_result"
9331032
}
@@ -940,13 +1039,13 @@
9401039
"classifier = RandomForestClassifier()\n",
9411040
"\n",
9421041
"# Training the model\n",
943-
"classifier.fit(X_train_transformed, y_train)\n",
1042+
"classifier.fit(X_train_transformed, y_train_transformed)\n",
9441043
"\n",
9451044
"# Prediction on unseen data\n",
9461045
"y_test_pred = classifier.predict(X_test_transformed)\n",
9471046
"\n",
9481047
"# Evaluation\n",
949-
"metrics.accuracy_score(y_test, y_test_pred)"
1048+
"metrics.accuracy_score(y_test_transformed, y_test_pred)"
9501049
]
9511050
}
9521051
],

0 commit comments

Comments
 (0)