Implement chi2

2022-10-13 13:25:24 +11:00 · 2022-10-13 13:25:24 +11:00 · 7e8418eb36
commit 7e8418eb36
parent edc82c1658
3 changed files with 141 additions and 1 deletions
--- a/tests/test_chi2.py
+++ b/tests/test_chi2.py
@ -0,0 +1,69 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
 #   Copyright © 2022  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
 #   the Free Software Foundation, either version 3 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU Affero General Public License for more details.
 #
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 from pytest import approx
 import numpy as np
 import pandas as pd
 import yli
 def test_chi2_ol10_15():
 	"""Compare yli.chi2 for Ott & Longnecker (2016) example 10.15"""
 	data = [
 		(1, 'Moderate', 15),
 		(2, 'Moderate', 32),
 		(3, 'Moderate', 18),
 		(4, 'Moderate', 5),
 		(1, 'Mildly Severe', 8),
 		(2, 'Mildly Severe', 29),
 		(3, 'Mildly Severe', 23),
 		(4, 'Mildly Severe', 18),
 		(1, 'Severe', 1),
 		(2, 'Severe', 20),
 		(3, 'Severe', 25),
 		(4, 'Severe', 22)
 	]
 	df = pd.DataFrame({
 		'AgeCategory': np.repeat([d[0] for d in data], [d[2] for d in data]),
 		'Severity': np.repeat([d[1] for d in data], [d[2] for d in data])
 	})
 	result = yli.chi2(df, 'Severity', 'AgeCategory')
 	assert result.statistic == approx(27.13, abs=0.01)
 	assert result.pvalue == approx(0.00014, abs=0.00001)
 def test_chi2_ol10_18():
 	"""Compare yli.chi2 for Ott & Longnecker (2016) example 10.18"""
 	data = [
 		(False, False, 250),
 		(True, False, 750),
 		(False, True, 400),
 		(True, True, 1600)
 	]
 	df = pd.DataFrame({
 		'Response': np.repeat([d[0] for d in data], [d[2] for d in data]),
 		'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
 	})
 	result = yli.chi2(df, 'Stress', 'Response')
 	assert result.oddsratio.point == approx(1.333, abs=0.001)
 	assert result.oddsratio.ci_lower == approx(1.113, abs=0.001)
 	assert result.oddsratio.ci_upper == approx(1.596, abs=0.001)
--- a/yli/init.py
+++ b/yli/init.py
@ -15,7 +15,7 @@
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
-from .sig_tests import mannwhitney, ttest_ind
+from .sig_tests import chi2, mannwhitney, ttest_ind
 def reload_me():
 	import importlib
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -14,6 +14,7 @@
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 import numpy as np
 import pandas as pd
 from scipy import stats
 import statsmodels.api as sm
@ -155,3 +156,73 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 		statistic=min(u1, u2), pvalue=result.pvalue,
 		#med1=data1.median(), med2=data2.median(),
 		rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
 # ------------------------
 # Pearson chi-squared test
 class PearsonChiSquaredResult:
 	"""Result of a Pearson chi-squared test"""
 	def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
 		self.ct = ct
 		self.statistic = statistic
 		self.dof = dof
 		self.pvalue = pvalue
 		self.oddsratio = oddsratio
 		self.riskratio = riskratio
 	def _repr_html_(self):
 		if self.oddsratio is not None:
 			return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}<br>OR (95% CI) = {}<br>RR (95% CI) = {}'.format(
 				self.ct._repr_html_(), self.dof, self.statistic, fmt_p_html(self.pvalue), self.oddsratio.summary(), self.riskratio.summary())
 		else:
 			return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
 				self.ct._repr_html_(), self.dof, self.statistic, fmt_p_html(self.pvalue))
 	def summary(self):
 		if self.oddsratio is not None:
 			return '{}\nχ²({}) = {:.2f}; p {}\nOR (95% CI) = {}\nRR (95% CI) = {}'.format(
 				self.ct, self.dof, self.statistic, fmt_p_text(self.pvalue), self.oddsratio.summary(), self.riskratio.summary())
 		else:
 			return '{}\nχ²({}) = {:.2f}; p {}'.format(
 				self.ct, self.dof, self.statistic, fmt_p_text(self.pvalue))
 def chi2(df, dep, ind, *, nan_policy='warn'):
 	"""
 	Perform a Pearson chi-squared test
 	"""
 	# Check for/clean NaNs
 	df = check_nan(df[[ind, dep]], nan_policy)
 	# Compute contingency table
 	ct = pd.crosstab(df[ind], df[dep])
 	# Get expected counts
 	expected = stats.contingency.expected_freq(ct)
 	# Warn on low expected counts
 	if (expected < 5).sum() / expected.size > 0.2:
 		warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
 	if (expected < 1).any():
 		warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
 	if ct.shape == (2,2):
 		# 2x2 table
 		# Use statsmodels to get OR andRR
 		smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
 		result = smct.test_nominal_association()
 		ORci = smct.oddsratio_confint()
 		RRci = smct.riskratio_confint()
 		return PearsonChiSquaredResult(
 			ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
 			oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
 	else:
 		# rxc table
 		# Just use SciPy
 		result = stats.chi2_contingency(ct, correction=False)
 		return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])