Implement chi2

2022-10-13 13:25:24 +11:00 · 2022-10-13 13:25:24 +11:00 · 7e8418eb36
commit 7e8418eb36
parent edc82c1658
3 changed files with 141 additions and 1 deletions
--- a/tests/test_chi2.py
+++ b/tests/test_chi2.py
@ -0,0 +1,69 @@
+#   scipy-yli: Helpful SciPy utilities and recipes
+#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from pytest import approx
+
+import numpy as np
+import pandas as pd
+
+import yli
+
+def test_chi2_ol10_15():
+	"""Check yli.chi2 against Ott & Longnecker (2016) example 10.15 (4 x 3 table)"""
+	
+	# One entry per cell of the contingency table: (age category, severity, frequency)
+	cells = [
+		(1, 'Moderate', 15),
+		(2, 'Moderate', 32),
+		(3, 'Moderate', 18),
+		(4, 'Moderate', 5),
+		(1, 'Mildly Severe', 8),
+		(2, 'Mildly Severe', 29),
+		(3, 'Mildly Severe', 23),
+		(4, 'Mildly Severe', 18),
+		(1, 'Severe', 1),
+		(2, 'Severe', 20),
+		(3, 'Severe', 25),
+		(4, 'Severe', 22)
+	]
+	
+	# Expand the cell frequencies into one data frame row per observation,
+	# since yli.chi2 takes raw observations rather than a pre-built table
+	ages, severities, counts = zip(*cells)
+	df = pd.DataFrame({
+		'AgeCategory': np.repeat(ages, counts),
+		'Severity': np.repeat(severities, counts)
+	})
+	
+	result = yli.chi2(df, 'Severity', 'AgeCategory')
+	
+	# Compare with the reference values to the precision asserted here
+	assert result.statistic == approx(27.13, abs=0.01)
+	assert result.pvalue == approx(0.00014, abs=0.00001)
+
+def test_chi2_ol10_18():
+	"""Check the yli.chi2 odds ratio against Ott & Longnecker (2016) example 10.18 (2 x 2 table)"""
+	
+	# One entry per cell of the contingency table: (response, stress, frequency)
+	cells = [
+		(False, False, 250),
+		(True, False, 750),
+		(False, True, 400),
+		(True, True, 1600)
+	]
+	
+	# Expand the cell frequencies into one data frame row per observation
+	responses, stresses, counts = zip(*cells)
+	df = pd.DataFrame({
+		'Response': np.repeat(responses, counts),
+		'Stress': np.repeat(stresses, counts)
+	})
+	
+	result = yli.chi2(df, 'Stress', 'Response')
+	
+	# A 2 x 2 table additionally yields an odds ratio with a confidence interval
+	odds_ratio = result.oddsratio
+	assert odds_ratio.point == approx(1.333, abs=0.001)
+	assert odds_ratio.ci_lower == approx(1.113, abs=0.001)
+	assert odds_ratio.ci_upper == approx(1.596, abs=0.001)
--- a/yli/__init__.py
+++ b/yli/__init__.py
@ -15,7 +15,7 @@
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.

 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
-from .sig_tests import mannwhitney, ttest_ind
+from .sig_tests import chi2, mannwhitney, ttest_ind

 def reload_me():
 	import importlib
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -14,6 +14,7 @@
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.

+import numpy as np
 import pandas as pd
 from scipy import stats
 import statsmodels.api as sm
@ -155,3 +156,73 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 		statistic=min(u1, u2), pvalue=result.pvalue,
 		#med1=data1.median(), med2=data2.median(),
 		rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
+
+# ------------------------
+# Pearson chi-squared test
+
+class PearsonChiSquaredResult:
+	"""
+	Result of a Pearson chi-squared test
+	
+	ct -- contingency table; must be printable and provide _repr_html_()
+	statistic -- chi-squared test statistic
+	dof -- degrees of freedom of the test
+	pvalue -- p value of the test
+	oddsratio -- optional odds ratio estimate providing summary(), or None
+	riskratio -- optional risk ratio estimate providing summary(), or None
+	"""
+	
+	def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
+		self.ct = ct
+		self.statistic = statistic
+		self.dof = dof
+		self.pvalue = pvalue
+		self.oddsratio = oddsratio
+		self.riskratio = riskratio
+	
+	def _effect_size_lines(self):
+		"""Return the OR/RR summary lines for whichever estimates are present"""
+		
+		# Each estimate is checked on its own: the constructor lets them be
+		# supplied independently, so neither may be assumed from the other
+		lines = []
+		if self.oddsratio is not None:
+			lines.append('OR (95% CI) = {}'.format(self.oddsratio.summary()))
+		if self.riskratio is not None:
+			lines.append('RR (95% CI) = {}'.format(self.riskratio.summary()))
+		return lines
+	
+	def _repr_html_(self):
+		"""Return an HTML rendering: the table, the test result, then any OR/RR lines"""
+		
+		lines = [
+			'{}'.format(self.ct._repr_html_()),
+			'<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue))
+		]
+		return '<br>'.join(lines + self._effect_size_lines())
+	
+	def summary(self):
+		"""Return a plain-text rendering: the table, the test result, then any OR/RR lines"""
+		
+		lines = [
+			'{}'.format(self.ct),
+			'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue))
+		]
+		return '\n'.join(lines + self._effect_size_lines())
+
+def chi2(df, dep, ind, *, nan_policy='warn'):
+	"""
+	Perform a Pearson chi-squared test of association between two columns
+	
+	df -- data frame with one row per observation
+	dep -- name of the dependent variable column (columns of the contingency table)
+	ind -- name of the independent variable column (rows of the contingency table)
+	nan_policy -- forwarded to check_nan, which handles missing values
+	
+	Returns a PearsonChiSquaredResult. For a 2x2 table it also carries the
+	odds ratio and risk ratio with their confidence intervals.
+	Emits a warning if more than 20% of cells have expected count < 5, or if
+	any cell has expected count < 1.
+	"""
+	
+	# Restrict to the two columns of interest and check for/clean NaNs
+	df = check_nan(df[[ind, dep]], nan_policy)
+	
+	# Cross-tabulate: independent variable on the rows, dependent on the columns
+	ct = pd.crosstab(df[ind], df[dep])
+	
+	# Tally cells whose expected count is low
+	expected = stats.contingency.expected_freq(ct)
+	n_cells = expected.size
+	n_below_5 = (expected < 5).sum()
+	n_below_1 = (expected < 1).sum()
+	
+	if n_below_5 / n_cells > 0.2:
+		warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format(n_below_5, n_cells, n_below_5 / n_cells * 100))
+	if n_below_1 > 0:
+		warnings.warn('{} cells have expected count < 1'.format(n_below_1))
+	
+	if ct.shape != (2, 2):
+		# r x c table
+		# SciPy alone suffices; no continuity correction is applied
+		
+		scipy_result = stats.chi2_contingency(ct, correction=False)
+		chi2_statistic, chi2_pvalue, chi2_dof = scipy_result[0], scipy_result[1], scipy_result[2]
+		
+		return PearsonChiSquaredResult(ct=ct, statistic=chi2_statistic, dof=chi2_dof, pvalue=chi2_pvalue)
+	
+	# 2x2 table
+	# Use statsmodels to get the OR and RR as well as the test itself
+	
+	# np.flip reverses both axes, so the last row/column level comes first
+	# NOTE(review): presumably so the highest-sorted level (e.g. True) is the reference cell for the OR/RR - confirm
+	table = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
+	association = table.test_nominal_association()
+	or_lower, or_upper = table.oddsratio_confint()
+	rr_lower, rr_upper = table.riskratio_confint()
+	
+	odds_ratio = Estimate(table.oddsratio, or_lower, or_upper)
+	risk_ratio = Estimate(table.riskratio, rr_lower, rr_upper)
+	
+	return PearsonChiSquaredResult(
+		ct=ct, statistic=association.statistic, dof=association.df, pvalue=association.pvalue,
+		oddsratio=odds_ratio, riskratio=risk_ratio)