Hypothesis Testing PDF
Hypothesis Testing PDF
The chi-square test of independence is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant
association between the two variables.
dataset.head()
dataset_table=pd.crosstab(dataset['sex'],dataset['smoker'])
print(dataset_table)
smoker Yes No
sex
Male 60 97
Female 33 54
dataset_table.values
array([[60, 97],
[33, 54]], dtype=int64)
#Observed Values
Observed_Values = dataset_table.values
print("Observed Values :-\n",Observed_Values)
Observed Values :-
[[60 97]
[33 54]]
val=stats.chi2_contingency(dataset_table)
val
(0.008763290531773594,
0.925417020494423,
1,
array([[59.84016393, 97.15983607],
[33.15983607, 53.84016393]]))
Expected_Values=val[3]
no_of_rows=len(dataset_table.iloc[0:2,0])
no_of_columns=len(dataset_table.iloc[0,0:2])
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",ddof)
alpha = 0.05
Degree of Freedom:- 1
print("chi-square statistic:-",chi_square_statistic)
critical_value: 3.841458820694124
#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('p-value:',p_value)
p-value: 0.964915107315732
Significance level: 0.05
Degree of Freedom: 1
p-value: 0.964915107315732
if chi_square_statistic>=critical_value:
print("Reject H0,There is a relationship between 2 categorical variables")
else:
print("Retain H0,There is no relationship between 2 categorical variables")
if p_value<=alpha:
print("Reject H0,There is a relationship between 2 categorical variables")
else:
print("Retain H0,There is no relationship between 2 categorical variables")
T Test
A t-test is a type of inferential statistical test that is used to determine whether there is a significant difference between the means of two groups,
which may be related in certain features.
ages=[10,20,35,50,28,40,55,18,16,55,30,25,43,18,30,28,14,24,16,17,32,35,26,27,65,18,43,23,21,20,19,70]
len(ages)
32
import numpy as np
ages_mean=np.mean(ages)
print(ages_mean)
30.34375
age_sample
np.mean(age_sample)
31.0
ttest,p_value=ttest_1samp(age_sample,30)
print(p_value)
0.7681189381229006
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
np.random.seed(6)
school_ages=stats.poisson.rvs(loc=18,mu=35,size=1500)
classA_ages=stats.poisson.rvs(loc=18,mu=30,size=25)
np.mean(school_ages)
53.303333333333335
classA_ages.mean()
48.2
_,p_value=stats.ttest_1samp(a=classA_ages,popmean=school_ages.mean())
p_value
3.26936314797003e-05
school_ages.mean()
53.303333333333335
50.63333333333333
_,p_value=stats.ttest_ind(a=classA_ages,b=ClassB_ages,equal_var=False)
p_value
0.06021969607248894
weight1=[25,30,28,35,28,34,26,29,30,26,28,32,31,30,45]
weight2=weight1+stats.norm.rvs(scale=5,loc=-1.25,size=15)
print(weight1)
print(weight2)
[25, 30, 28, 35, 28, 34, 26, 29, 30, 26, 28, 32, 31, 30, 45]
[30.57926457 34.91022437 29.00444617 30.54295091 19.86201983 37.57873174
18.3299827 21.3771395 36.36420881 32.05941216 26.93827982 29.519014
26.42851213 30.50667769 41.32984284]
weight_df=pd.DataFrame({"weight_10":np.array(weight1),
"weight_20":np.array(weight2),
"weight_change":np.array(weight2)-np.array(weight1)})
weight_df
0 25 30.579265 5.579265
1 30 34.910224 4.910224
2 28 29.004446 1.004446
3 35 30.542951 -4.457049
4 28 19.862020 -8.137980
5 34 37.578732 3.578732
6 26 18.329983 -7.670017
7 29 21.377139 -7.622861
8 30 36.364209 6.364209
9 26 32.059412 6.059412
10 28 26.938280 -1.061720
11 32 29.519014 -2.480986
12 31 26.428512 -4.571488
13 30 30.506678 0.506678
14 45 41.329843 -3.670157
_,p_value=stats.ttest_rel(a=weight1,b=weight2)
print(p_value)
0.5732936534411279
Correlation
import seaborn as sns
df=sns.load_dataset('iris')
df.shape
(150, 5)
df.corr()
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x29595f8ea60>
ANOVA Test (F-Test)
The t-test works well when dealing with two groups, but sometimes we want to compare more than two groups at the same time.
For example, if we wanted to test whether petal_width differs based on some categorical variable like species, we have to compare
the means of each level or group of the variable.
Example: there are 3 different categories of iris flowers, each with petal width measurements, and we need to check whether all 3 groups are similar or not.
df1.head()
df_anova = df1[['petal_width','species']]
grps = pd.unique(df_anova.species.values)
grps
d_data
{'setosa': 0 0.2
1 0.2
2 0.2
3 0.2
4 0.2
5 0.4
6 0.3
7 0.2
8 0.2
9 0.1
10 0.2
11 0.2
12 0.1
13 0.1
14 0.2
15 0.4
16 0.4
17 0.3
18 0.3
19 0.3
20 0.2
21 0.4
22 0.2
23 0.5
24 0.2
25 0.2
26 0.4
27 0.2
28 0.2
29 0.2
30 0.2
31 0.4
32 0.1
33 0.2
34 0.2
35 0.2
36 0.2
37 0.1
38 0.2
39 0.2
40 0.3
41 0.3
42 0.2
43 0.6
44 0.4
45 0.3
46 0.2
47 0.2
48 0.2
49 0.2
Name: petal_width, dtype: float64,
'versicolor': 50 1.4
51 1.5
52 1.5
53 1.3
54 1.5
55 1.3
56 1.6
57 1.0
58 1.3
59 1.4
60 1.0
61 1.5
62 1.0
63 1.4
64 1.3
65 1.4
66 1.5
67 1.0
68 1.5
69 1.1
70 1.8
71 1.3
72 1.5
73 1.2
74 1.3
75 1.4
76 1.4
77 1.7
78 1.5
79 1.0
80 1.1
81 1.0
82 1.2
83 1.6
84 1.5
85 1.6
86 1.5
87 1.3
88 1.3
89 1.3
90 1.2
91 1.4
92 1.2
93 1.0
94 1.3
95 1.2
96 1.3
97 1.3
98 1.1
99 1.3
Name: petal_width, dtype: float64,
'virginica': 100 2.5
101 1.9
102 2.1
103 1.8
104 2.2
105 2.1
106 1.7
107 1.8
108 1.8
109 2.5
110 2.0
111 1.9
112 2.1
113 2.0
114 2.4
115 2.3
116 1.8
117 2.2
118 2.3
119 1.5
120 2.3
121 2.0
122 2.0
123 1.8
124 2.1
125 1.8
126 1.8
127 1.8
128 2.1
129 1.6
130 1.9
131 2.0
132 2.2
133 1.5
134 1.4
135 2.3
136 2.4
137 1.8
138 1.8
139 2.1
140 2.4
141 2.3
142 1.9
143 2.3
144 2.5
145 2.3
146 1.9
147 2.0
148 2.3
149 1.8
Name: petal_width, dtype: float64}
print(p)
4.169445839443116e-85
if p<0.05:
print("reject null hypothesis")
else:
print("accept null hypothesis")
# imports
import math
import numpy as np
from numpy.random import randn
from statsmodels.stats.weightstats import ztest
mean=109.61 stdv=2.22
# Now we perform the test. We pass the data to the function; in the value parameter
# we pass the mean value under the null hypothesis; with the alternative hypothesis we check whether the
# mean is larger.
df.head()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence
0 1.0 60.0 RL 65.0 8450.0 Pave None Reg Lvl AllPub ... 0.0 0.0 None None
1 2.0 20.0 RL 80.0 9600.0 Pave None Reg Lvl AllPub ... 0.0 0.0 None None
2 3.0 60.0 RL 68.0 11250.0 Pave None IR1 Lvl AllPub ... 0.0 0.0 None None
3 4.0 70.0 RL 60.0 9550.0 Pave None IR1 Lvl AllPub ... 0.0 0.0 None None
4 5.0 60.0 RL 84.0 14260.0 Pave None IR1 Lvl AllPub ... 0.0 0.0 None None
5 rows × 80 columns
import seaborn as sns
sns.load_dataset('titanic')
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True