Skip to content

Commit baca951

Browse files
authored
Add files via upload
1 parent 5f24b29 commit baca951

File tree

3 files changed

+283
-0
lines changed

3 files changed

+283
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import numpy as np
import pandas as pd
import scipy.stats as stats

# Chi-squared goodness-of-fit test: does Minnesota's demographic makeup
# match the national population's proportions?

# Synthetic national population (one row per person, labelled by group).
national = pd.DataFrame(["white"] * 100000 + ["hispanic"] * 60000 +
                        ["black"] * 50000 + ["asian"] * 15000 + ["other"] * 35000)

# Synthetic Minnesota sample drawn the same way.
minnesota = pd.DataFrame(["white"] * 600 + ["hispanic"] * 300 +
                         ["black"] * 250 + ["asian"] * 75 + ["other"] * 150)

# One-column frequency tables: rows = groups, single "count" column.
national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print("National")
print(national_table)
print(" ")
print("Minnesota")
print(minnesota_table)

observed = minnesota_table

national_ratios = national_table / len(national)  # Population proportions

expected = national_ratios * len(minnesota)       # Expected counts under H0

# Chi-squared statistic: sum over categories of (O - E)^2 / E.
chi_squared_stat = (((observed - expected) ** 2) / expected).sum()

print(chi_squared_stat)

# Degrees of freedom = number of categories - 1, derived from the data
# instead of hard-coding 4 so the script stays correct if groups change.
deg_freedom = len(observed) - 1

crit = stats.chi2.ppf(q=0.95,          # Critical value for 95% confidence
                      df=deg_freedom)

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Upper-tail p-value
                             df=deg_freedom)
print("P value")
print(p_value)

# Cross-check with SciPy's built-in test. The original computed this but
# discarded the result; print it so the comparison is actually visible.
result = stats.chisquare(f_obs=observed,   # Array of observed counts
                         f_exp=expected)   # Array of expected counts
print(result)
44+
45+
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "Z-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "AfRng3KPuSBQ",
20+
"colab_type": "code",
21+
"colab": {
22+
"base_uri": "https://localhost:8080/",
23+
"height": 51
24+
},
25+
"outputId": "01b490f3-42b4-410f-ff53-bd9bf782e0d6"
26+
},
27+
"source": [
28+
"def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):\n",
29+
" from numpy import sqrt, abs, round\n",
30+
" from scipy.stats import norm\n",
31+
" pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)\n",
32+
" z = ((X1 - X2) - mudiff)/pooledSE\n",
33+
" pval = 2*(1 - norm.cdf(abs(z)))\n",
34+
" return round(z, 3), round(pval, 4)\n",
35+
"\n",
36+
"\n",
37+
"\n",
38+
"z, p = twoSampZ(28, 33, 0, 14.1, 9.5, 75, 50)\n",
39+
"print(\"Z Score:\",z)\n",
40+
"print(\"P-Value:\",p)"
41+
],
42+
"execution_count": 2,
43+
"outputs": [
44+
{
45+
"output_type": "stream",
46+
"text": [
47+
"Z Score: -2.369\n",
48+
"P-Value: 0.0179\n"
49+
],
50+
"name": "stdout"
51+
}
52+
]
53+
}
54+
]
55+
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "t-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "Y21N_2yv3Grl",
20+
"colab_type": "code",
21+
"colab": {}
22+
},
23+
"source": [
24+
"## Import the packages\n",
25+
"import numpy as np\n",
26+
"from scipy import stats"
27+
],
28+
"execution_count": 0,
29+
"outputs": []
30+
},
31+
{
32+
"cell_type": "code",
33+
"metadata": {
34+
"id": "Aga_0SM43OdO",
35+
"colab_type": "code",
36+
"colab": {
37+
"base_uri": "https://localhost:8080/",
38+
"height": 85
39+
},
40+
"outputId": "8d4b8f3d-a6df-4129-b3b6-867144288009"
41+
},
42+
"source": [
43+
"## Define 2 random distributions\n",
44+
"\n",
45+
"#Sample Size\n",
46+
"N = 10\n",
47+
"\n",
48+
"#Gaussian distributed data with mean = 2 and var = 1\n",
49+
"a = np.random.randn(N) + 2\n",
50+
"print(a)\n",
51+
"\n",
52+
"#Gaussian distributed data with with mean = 0 and var = 1\n",
53+
"b = np.random.randn(N)\n",
54+
"print(b)"
55+
],
56+
"execution_count": 6,
57+
"outputs": [
58+
{
59+
"output_type": "stream",
60+
"text": [
61+
"[3.41987841 2.4642942 1.3074381 1.88900262 1.5018451 2.08785958\n",
62+
" 4.18763608 2.76111147 1.25673154 1.22916177]\n",
63+
"[ 0.09625918 -0.426427 -0.81593085 -0.27386856 -0.19758738 0.71729565\n",
64+
" -0.44211666 0.07106772 -0.53144206 -0.21403634]\n"
65+
],
66+
"name": "stdout"
67+
}
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"metadata": {
73+
"id": "DGw_0SoQ2Uhj",
74+
"colab_type": "code",
75+
"colab": {
76+
"base_uri": "https://localhost:8080/",
77+
"height": 51
78+
},
79+
"outputId": "b6caa6b7-64df-44e7-b626-13fb8165baeb"
80+
},
81+
"source": [
82+
"## Calculate the Standard Deviation\n",
83+
"\n",
84+
"#Calculate the variance to get the standard deviation\n",
85+
"\n",
86+
"#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1\n",
87+
"var_a = a.var(ddof=1)\n",
88+
"var_b = b.var(ddof=1)\n",
89+
"\n",
90+
"#std deviation\n",
91+
"s = np.sqrt((var_a + var_b)/2)\n",
92+
"\n",
93+
"print(\"Std Deviation:\", s)\n",
94+
"\n",
95+
"## Calculate the t-statistics\n",
96+
"t = (a.mean() - b.mean())/(s*np.sqrt(2/N))\n",
97+
"\n",
98+
"print(\"T-value:\", t)"
99+
],
100+
"execution_count": 8,
101+
"outputs": [
102+
{
103+
"output_type": "stream",
104+
"text": [
105+
"Std Deviation: 0.7693967525636721\n",
106+
"T-value: 7.0104093570005945\n"
107+
],
108+
"name": "stdout"
109+
}
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"metadata": {
115+
"id": "9atPC3HO3Z2U",
116+
"colab_type": "code",
117+
"colab": {
118+
"base_uri": "https://localhost:8080/",
119+
"height": 51
120+
},
121+
"outputId": "683ac7bd-8bd8-4e55-d3fc-7942748b66d1"
122+
},
123+
"source": [
124+
"## Compare with the critical t-value\n",
125+
"\n",
126+
"#Degrees of freedom\n",
127+
"df = 2*N - 2\n",
128+
"\n",
129+
"#p-value after comparison with the t\n",
130+
"p = 1 - stats.t.cdf(t,df=df)\n",
131+
"\n",
132+
"print(\"t-Score = \" + str(t))\n",
133+
"print(\"p-Value = \" + str(2*p))\n",
134+
"\n",
135+
"#Note that we multiply the p value by 2 because its a twp tail t-test\n",
136+
"\n",
137+
"### You can see that after comparing the t statistic with the critical t value (computed internally)\n",
138+
"# we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean\n",
139+
"# of the two distributions are different and statistically significant."
140+
],
141+
"execution_count": 9,
142+
"outputs": [
143+
{
144+
"output_type": "stream",
145+
"text": [
146+
"t-Score = 7.0104093570005945\n",
147+
"p-Value = 1.522899394812427e-06\n"
148+
],
149+
"name": "stdout"
150+
}
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"metadata": {
156+
"id": "I_ve3N6a3Mlo",
157+
"colab_type": "code",
158+
"colab": {
159+
"base_uri": "https://localhost:8080/",
160+
"height": 51
161+
},
162+
"outputId": "cc8bcc64-e1a2-4c05-98b9-db0cf91a0a01"
163+
},
164+
"source": [
165+
"## Cross Checking with the internal scipy function\n",
166+
"t2, p2 = stats.ttest_ind(a,b)\n",
167+
"print(\"t = \" + str(t2))\n",
168+
"print(\"p = \" + str(2*p2))"
169+
],
170+
"execution_count": 10,
171+
"outputs": [
172+
{
173+
"output_type": "stream",
174+
"text": [
175+
"t = 7.010409357000594\n",
176+
"p = 3.045798789679482e-06\n"
177+
],
178+
"name": "stdout"
179+
}
180+
]
181+
}
182+
]
183+
}

0 commit comments

Comments
 (0)