-
Notifications
You must be signed in to change notification settings - Fork 1
/
chi_squared_test.py
36 lines (25 loc) · 945 Bytes
/
chi_squared_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# are category 1 and 2 independent?
import numpy as np
from scipy.stats import chi2, chi2_contingency
# axis 0 (rows) = category 1, axis 1 (cols) = category 2
data = np.array([[90, 60, 104, 95],
[30, 50, 51, 20],
[30, 40, 45, 35]])
# total number for each category 1
total_each1 = data.sum(axis=1)
total_each2 = data.sum(axis=0)
# total number of samples for each of category 1 and 2
total = np.sum(data)
# what do we expect for each sample in category 1 and 2 based on data
expected_each = total_each1[:, None] * total_each2[None, :] / total
# terms for chi^2
test_for_each_sample = (data - expected_each)**2 / expected_each
# chi^2 value
test_stat = np.sum(test_for_each_sample)
# number of degrees of freedom
ndofs = (data.shape[0] - 1) * (data.shape[1] - 1)
# p-value
p = 1 - chi2.cdf(test_stat, ndofs)
# confirm with scipy
p_scipy = chi2_contingency(data).pvalue
print(f"{p = } vs {p_scipy =}")