Skip to content

Commit baca951

Browse files
authored
Add files via upload
1 parent 5f24b29 commit baca951

File tree

3 files changed

+283
-0
lines changed

3 files changed

+283
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import numpy as np
import pandas as pd
import scipy.stats as stats

# Chi-squared goodness-of-fit test: does Minnesota's demographic makeup
# match the national population's proportions?

# Synthetic national population (one row per person, labelled by group).
national = pd.DataFrame(["white"] * 100000 + ["hispanic"] * 60000 +
                        ["black"] * 50000 + ["asian"] * 15000 + ["other"] * 35000)

# Synthetic Minnesota sample drawn the same way.
minnesota = pd.DataFrame(["white"] * 600 + ["hispanic"] * 300 +
                         ["black"] * 250 + ["asian"] * 75 + ["other"] * 150)

# One-column frequency tables: rows = groups, single "count" column.
national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print("National")
print(national_table)
print(" ")
print("Minnesota")
print(minnesota_table)

observed = minnesota_table

national_ratios = national_table / len(national)  # Population proportions

expected = national_ratios * len(minnesota)       # Expected counts under H0

# Chi-squared statistic: sum over categories of (O - E)^2 / E.
chi_squared_stat = (((observed - expected) ** 2) / expected).sum()

print(chi_squared_stat)

# Degrees of freedom = number of categories - 1, derived from the data
# instead of hard-coding 4 so the script stays correct if groups change.
deg_freedom = len(observed) - 1

crit = stats.chi2.ppf(q=0.95,          # Critical value for 95% confidence
                      df=deg_freedom)

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Upper-tail p-value
                             df=deg_freedom)
print("P value")
print(p_value)

# Cross-check with SciPy's built-in test. The original computed this but
# discarded the result; print it so the comparison is actually visible.
result = stats.chisquare(f_obs=observed,   # Array of observed counts
                         f_exp=expected)   # Array of expected counts
print(result)
44+
45+
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "Z-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "AfRng3KPuSBQ",
20+
"colab_type": "code",
21+
"colab": {
22+
"base_uri": "https://localhost:8080/",
23+
"height": 51
24+
},
25+
"outputId": "01b490f3-42b4-410f-ff53-bd9bf782e0d6"
26+
},
27+
"source": [
28+
"def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):\n",
29+
" from numpy import sqrt, abs, round\n",
30+
" from scipy.stats import norm\n",
31+
" pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)\n",
32+
" z = ((X1 - X2) - mudiff)/pooledSE\n",
33+
" pval = 2*(1 - norm.cdf(abs(z)))\n",
34+
" return round(z, 3), round(pval, 4)\n",
35+
"\n",
36+
"\n",
37+
"\n",
38+
"z, p = twoSampZ(28, 33, 0, 14.1, 9.5, 75, 50)\n",
39+
"print(\"Z Score:\",z)\n",
40+
"print(\"P-Value:\",p)"
41+
],
42+
"execution_count": 2,
43+
"outputs": [
44+
{
45+
"output_type": "stream",
46+
"text": [
47+
"Z Score: -2.369\n",
48+
"P-Value: 0.0179\n"
49+
],
50+
"name": "stdout"
51+
}
52+
]
53+
}
54+
]
55+
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "t-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "Y21N_2yv3Grl",
20+
"colab_type": "code",
21+
"colab": {}
22+
},
23+
"source": [
24+
"## Import the packages\n",
25+
"import numpy as np\n",
26+
"from scipy import stats"
27+
],
28+
"execution_count": 0,
29+
"outputs": []
30+
},
31+
{
32+
"cell_type": "code",
33+
"metadata": {
34+
"id": "Aga_0SM43OdO",
35+
"colab_type": "code",
36+
"colab": {
37+
"base_uri": "https://localhost:8080/",
38+
"height": 85
39+
},
40+
"outputId": "8d4b8f3d-a6df-4129-b3b6-867144288009"
41+
},
42+
"source": [
43+
"## Define 2 random distributions\n",
44+
"\n",
45+
"#Sample Size\n",
46+
"N = 10\n",
47+
"\n",
48+
"#Gaussian distributed data with mean = 2 and var = 1\n",
49+
"a = np.random.randn(N) + 2\n",
50+
"print(a)\n",
51+
"\n",
52+
"#Gaussian distributed data with with mean = 0 and var = 1\n",
53+
"b = np.random.randn(N)\n",
54+
"print(b)"
55+
],
56+
"execution_count": 6,
57+
"outputs": [
58+
{
59+
"output_type": "stream",
60+
"text": [
61+
"[3.41987841 2.4642942 1.3074381 1.88900262 1.5018451 2.08785958\n",
62+
" 4.18763608 2.76111147 1.25673154 1.22916177]\n",
63+
"[ 0.09625918 -0.426427 -0.81593085 -0.27386856 -0.19758738 0.71729565\n",
64+
" -0.44211666 0.07106772 -0.53144206 -0.21403634]\n"
65+
],
66+
"name": "stdout"
67+
}
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"metadata": {
73+
"id": "DGw_0SoQ2Uhj",
74+
"colab_type": "code",
75+
"colab": {
76+
"base_uri": "https://localhost:8080/",
77+
"height": 51
78+
},
79+
"outputId": "b6caa6b7-64df-44e7-b626-13fb8165baeb"
80+
},
81+
"source": [
82+
"## Calculate the Standard Deviation\n",
83+
"\n",
84+
"#Calculate the variance to get the standard deviation\n",
85+
"\n",
86+
"#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1\n",
87+
"var_a = a.var(ddof=1)\n",
88+
"var_b = b.var(ddof=1)\n",
89+
"\n",
90+
"#std deviation\n",
91+
"s = np.sqrt((var_a + var_b)/2)\n",
92+
"\n",
93+
"print(\"Std Deviation:\", s)\n",
94+
"\n",
95+
"## Calculate the t-statistics\n",
96+
"t = (a.mean() - b.mean())/(s*np.sqrt(2/N))\n",
97+
"\n",
98+
"print(\"T-value:\", t)"
99+
],
100+
"execution_count": 8,
101+
"outputs": [
102+
{
103+
"output_type": "stream",
104+
"text": [
105+
"Std Deviation: 0.7693967525636721\n",
106+
"T-value: 7.0104093570005945\n"
107+
],
108+
"name": "stdout"
109+
}
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"metadata": {
115+
"id": "9atPC3HO3Z2U",
116+
"colab_type": "code",
117+
"colab": {
118+
"base_uri": "https://localhost:8080/",
119+
"height": 51
120+
},
121+
"outputId": "683ac7bd-8bd8-4e55-d3fc-7942748b66d1"
122+
},
123+
"source": [
124+
"## Compare with the critical t-value\n",
125+
"\n",
126+
"#Degrees of freedom\n",
127+
"df = 2*N - 2\n",
128+
"\n",
129+
"#p-value after comparison with the t\n",
130+
"p = 1 - stats.t.cdf(t,df=df)\n",
131+
"\n",
132+
"print(\"t-Score = \" + str(t))\n",
133+
"print(\"p-Value = \" + str(2*p))\n",
134+
"\n",
135+
"#Note that we multiply the p value by 2 because its a twp tail t-test\n",
136+
"\n",
137+
"### You can see that after comparing the t statistic with the critical t value (computed internally)\n",
138+
"# we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean\n",
139+
"# of the two distributions are different and statistically significant."
140+
],
141+
"execution_count": 9,
142+
"outputs": [
143+
{
144+
"output_type": "stream",
145+
"text": [
146+
"t-Score = 7.0104093570005945\n",
147+
"p-Value = 1.522899394812427e-06\n"
148+
],
149+
"name": "stdout"
150+
}
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"metadata": {
156+
"id": "I_ve3N6a3Mlo",
157+
"colab_type": "code",
158+
"colab": {
159+
"base_uri": "https://localhost:8080/",
160+
"height": 51
161+
},
162+
"outputId": "cc8bcc64-e1a2-4c05-98b9-db0cf91a0a01"
163+
},
164+
"source": [
165+
"## Cross Checking with the internal scipy function\n",
166+
"t2, p2 = stats.ttest_ind(a,b)\n",
167+
"print(\"t = \" + str(t2))\n",
168+
"print(\"p = \" + str(2*p2))"
169+
],
170+
"execution_count": 10,
171+
"outputs": [
172+
{
173+
"output_type": "stream",
174+
"text": [
175+
"t = 7.010409357000594\n",
176+
"p = 3.045798789679482e-06\n"
177+
],
178+
"name": "stdout"
179+
}
180+
]
181+
}
182+
]
183+
}

0 commit comments

Comments
 (0)