scipy-yli/yli/sig_tests.py

#   scipy-yli: Helpful SciPy utilities and recipes
#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import functools
import warnings

from .utils import Estimate, as_2groups, check_nan, fmt_p

# ----------------
# Student's t test

class TTestResult:
	"""
	Result of a Student's t test
	
	delta: Mean difference
	"""
	
	def __init__(self, statistic, dof, pvalue, delta, delta_direction):
		self.statistic = statistic
		self.dof = dof
		self.pvalue = pvalue
		self.delta = delta
		self.delta_direction = delta_direction
	
	def _repr_html_(self):
		return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.delta.summary(), self.delta_direction)
	
	def summary(self):
		return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.delta.summary(), self.delta_direction)

def ttest_ind(df, dep, ind, *, nan_policy='warn'):
	"""Perform an independent-sample Student's t test"""
	
	# Check for/clean NaNs
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Ensure 2 groups for ind
	group1, data1, group2, data2 = as_2groups(df, dep, ind)
	
	# Do t test
	# Use statsmodels rather than SciPy because this provides the mean difference automatically
	d1 = sm.stats.DescrStatsW(data1)
	d2 = sm.stats.DescrStatsW(data2)
	
	cm = sm.stats.CompareMeans(d1, d2)
	statistic, pvalue, dof = cm.ttest_ind()
	
	delta = d1.mean - d2.mean
	ci0, ci1 = cm.tconfint_diff()
	
	# t test is symmetric so take absolute values
	return TTestResult(
		statistic=abs(statistic), dof=dof, pvalue=pvalue,
		delta=abs(Estimate(delta, ci0, ci1)),
		delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))

# -------------
# One-way ANOVA

class FTestResult:
	"""Result of an F test for ANOVA/regression"""
	
	def __init__(self, statistic, dof1, dof2, pvalue):
		self.statistic = statistic
		self.dof1 = dof1
		self.dof2 = dof2
		self.pvalue = pvalue
	
	def _repr_html_(self):
		return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
	
	def summary(self):
		return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))

def anova_oneway(df, dep, ind, *, nan_policy='omit'):
	"""Perform one-way ANOVA"""
	
	# Check for/clean NaNs
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Group by independent variable
	groups = df.groupby(ind)[dep]
	
	# Perform one-way ANOVA
	result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
	
	# See stats.f_oneway implementation
	dfbn = len(groups.groups) - 1
	dfwn = len(df) - len(groups.groups)
	
	return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)

# -----------------
# Mann-Whitney test

class MannWhitneyResult:
	"""
	Result of a Mann-Whitney test
	
	brunnermunzel: BrunnerMunzelResult on same data
	"""
	
	def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
		self.statistic = statistic
		self.pvalue = pvalue
		self.rank_biserial = rank_biserial
		self.direction = direction
		self.brunnermunzel = brunnermunzel
	
	def _repr_html_(self):
		line1 = '<i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=True), self.rank_biserial, self.direction)
		if self.brunnermunzel:
			return line1 + '<br>' + self.brunnermunzel._repr_html_()
		else:
			return line1
	
	def summary(self):
		line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
		if self.brunnermunzel:
			return line1 + '\n' + self.brunnermunzel.summary()
		else:
			return line1

class BrunnerMunzelResult:
	"""Result of a Brunner-Munzel test"""
	
	def __init__(self, statistic, pvalue):
		self.statistic = statistic
		self.pvalue = pvalue
	
	def _repr_html_(self):
		return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, html=True))
	
	def summary(self):
		return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False))

def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
	"""
	Perform a Mann-Whitney test
	
	brunnermunzel: Set to False to skip the Brunner-Munzel test
	use_continuity, alternative, method: See scipy.stats.mannwhitneyu
	"""
	
	# Check for/clean NaNs
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Ensure 2 groups for ind
	group1, data1, group2, data2 = as_2groups(df, dep, ind)
	
	# Do Mann-Whitney test
	# Stata does not perform continuity correction
	result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
	
	u1 = result.statistic
	u2 = len(data1) * len(data2) - u1
	r = abs(2*u1 / (len(data1) * len(data2)) - 1)  # rank-biserial
	
	# If significant, perform a Brunner-Munzel test for our interest
	if result.pvalue < 0.05 and brunnermunzel:
		result_bm = stats.brunnermunzel(data1, data2)
		
		if result_bm.pvalue >= 0.05:
			warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
			
			return MannWhitneyResult(
				statistic=min(u1, u2), pvalue=result.pvalue,
				#med1=data1.median(), med2=data2.median(),
				rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),
				brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))
	
	return MannWhitneyResult(
		statistic=min(u1, u2), pvalue=result.pvalue,
		#med1=data1.median(), med2=data2.median(),
		rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))

# ------------------------
# Pearson chi-squared test

class PearsonChiSquaredResult:
	"""Result of a Pearson chi-squared test"""
	
	def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
		self.ct = ct
		self.statistic = statistic
		self.dof = dof
		self.pvalue = pvalue
		self.oddsratio = oddsratio
		self.riskratio = riskratio
	
	def _repr_html_(self):
		if self.oddsratio is not None:
			return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}<br>OR (95% CI) = {}<br>RR (95% CI) = {}'.format(
				self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.oddsratio.summary(), self.riskratio.summary())
		else:
			return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
				self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True))
	
	def summary(self):
		if self.oddsratio is not None:
			return '{}\nχ²({}) = {:.2f}; p {}\nOR (95% CI) = {}\nRR (95% CI) = {}'.format(
				self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.oddsratio.summary(), self.riskratio.summary())
		else:
			return '{}\nχ²({}) = {:.2f}; p {}'.format(
				self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))

def chi2(df, dep, ind, *, nan_policy='warn'):
	"""Perform a Pearson chi-squared test"""
	
	# Check for/clean NaNs
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Compute contingency table
	ct = pd.crosstab(df[ind], df[dep])
	
	# Get expected counts
	expected = stats.contingency.expected_freq(ct)
	
	# Warn on low expected counts
	if (expected < 5).sum() / expected.size > 0.2:
		warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
	if (expected < 1).any():
		warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
	
	if ct.shape == (2,2):
		# 2x2 table
		# Use statsmodels to get OR and RR
		
		smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
		result = smct.test_nominal_association()
		ORci = smct.oddsratio_confint()
		RRci = smct.riskratio_confint()
		
		return PearsonChiSquaredResult(
			ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
			oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
	else:
		# rxc table
		# Just use SciPy
		
		result = stats.chi2_contingency(ct, correction=False)
		
		return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])

# -------------------
# Pearson correlation

class PearsonRResult:
	"""Result of Pearson correlation"""
	
	def __init__(self, statistic, pvalue):
		self.statistic = statistic
		self.pvalue = pvalue
	
	def _repr_html_(self):
		return '<i>r</i> (95% CI) = {}; <i>p</i> {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=True))
	
	def summary(self):
		return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=False))

def pearsonr(df, dep, ind, *, nan_policy='warn'):
	"""Compute the Pearson correlation coefficient (Pearson's r)"""
	
	# Check for/clean NaNs
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Compute Pearson's r
	result = stats.pearsonr(df[ind], df[dep])
	ci = result.confidence_interval()
	
	return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`# scipy-yli: Helpful SciPy utilities and recipes`
			`# Copyright © 2022 Lee Yingtong Li (RunasSudo)`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Affero General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Affero General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Affero General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`

Implement chi2 2022-10-13 13:25:24 +11:00			`import numpy as np`
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`import pandas as pd`
			`from scipy import stats`
			`import statsmodels.api as sm`

			`import functools`
			`import warnings`

Refactor fmt_p 2022-10-14 14:48:26 +11:00			`from .utils import Estimate, as_2groups, check_nan, fmt_p`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
Implement ttest_ind 2022-10-13 12:53:18 +11:00			`# ----------------`
			`# Student's t test`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
			`class TTestResult:`
			`"""`
			`Result of a Student's t test`

			`delta: Mean difference`
			`"""`

Implement ttest_ind 2022-10-13 12:53:18 +11:00			`def __init__(self, statistic, dof, pvalue, delta, delta_direction):`
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`self.statistic = statistic`
			`self.dof = dof`
			`self.pvalue = pvalue`
			`self.delta = delta`
Implement ttest_ind 2022-10-13 12:53:18 +11:00			`self.delta_direction = delta_direction`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
			`def _repr_html_(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.delta.summary(), self.delta_direction)`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
			`def summary(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.delta.summary(), self.delta_direction)`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
			`def ttest_ind(df, dep, ind, *, nan_policy='warn'):`
			`"""Perform an independent-sample Student's t test"""`

Implement ttest_ind 2022-10-13 12:53:18 +11:00			`# Check for/clean NaNs`
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`df = check_nan(df[[ind, dep]], nan_policy)`

Implement ttest_ind 2022-10-13 12:53:18 +11:00			`# Ensure 2 groups for ind`
			`group1, data1, group2, data2 = as_2groups(df, dep, ind)`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
			`# Do t test`
			`# Use statsmodels rather than SciPy because this provides the mean difference automatically`
Implement ttest_ind 2022-10-13 12:53:18 +11:00			`d1 = sm.stats.DescrStatsW(data1)`
			`d2 = sm.stats.DescrStatsW(data2)`
Implement ttest_ind 2022-10-11 22:52:42 +11:00
Implement ttest_ind 2022-10-13 12:53:18 +11:00			`cm = sm.stats.CompareMeans(d1, d2)`
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`statistic, pvalue, dof = cm.ttest_ind()`

Implement ttest_ind 2022-10-13 12:53:18 +11:00			`delta = d1.mean - d2.mean`
Implement ttest_ind 2022-10-11 22:52:42 +11:00			`ci0, ci1 = cm.tconfint_diff()`

Implement ttest_ind 2022-10-13 12:53:18 +11:00			`# t test is symmetric so take absolute values`
			`return TTestResult(`
			`statistic=abs(statistic), dof=dof, pvalue=pvalue,`
			`delta=abs(Estimate(delta, ci0, ci1)),`
			`delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))`
Implement mannwhitney 2022-10-13 13:14:51 +11:00
Implement anova_oneway 2022-10-14 20:18:25 +11:00			`# -------------`
			`# One-way ANOVA`

			`class FTestResult:`
			`"""Result of an F test for ANOVA/regression"""`

			`def __init__(self, statistic, dof1, dof2, pvalue):`
			`self.statistic = statistic`
			`self.dof1 = dof1`
			`self.dof2 = dof2`
			`self.pvalue = pvalue`

			`def _repr_html_(self):`
			`return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))`

			`def summary(self):`
			`return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))`

			`def anova_oneway(df, dep, ind, *, nan_policy='omit'):`
			`"""Perform one-way ANOVA"""`

			`# Check for/clean NaNs`
			`df = check_nan(df[[ind, dep]], nan_policy)`

			`# Group by independent variable`
			`groups = df.groupby(ind)[dep]`

			`# Perform one-way ANOVA`
			`result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])`

			`# See stats.f_oneway implementation`
			`dfbn = len(groups.groups) - 1`
			`dfwn = len(df) - len(groups.groups)`

			`return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)`

Implement mannwhitney 2022-10-13 13:14:51 +11:00			`# -----------------`
			`# Mann-Whitney test`

			`class MannWhitneyResult:`
			`"""`
			`Result of a Mann-Whitney test`

			`brunnermunzel: BrunnerMunzelResult on same data`
			`"""`

			`def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):`
			`self.statistic = statistic`
			`self.pvalue = pvalue`
			`self.rank_biserial = rank_biserial`
			`self.direction = direction`
			`self.brunnermunzel = brunnermunzel`

			`def _repr_html_(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`line1 = '<i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=True), self.rank_biserial, self.direction)`
Implement mannwhitney 2022-10-13 13:14:51 +11:00			`if self.brunnermunzel:`
			`return line1 + '<br>' + self.brunnermunzel._repr_html_()`
			`else:`
			`return line1`

			`def summary(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)`
Implement mannwhitney 2022-10-13 13:14:51 +11:00			`if self.brunnermunzel:`
			`return line1 + '\n' + self.brunnermunzel.summary()`
			`else:`
			`return line1`

			`class BrunnerMunzelResult:`
			`"""Result of a Brunner-Munzel test"""`

			`def __init__(self, statistic, pvalue):`
			`self.statistic = statistic`
			`self.pvalue = pvalue`

			`def _repr_html_(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, html=True))`
Implement mannwhitney 2022-10-13 13:14:51 +11:00
			`def summary(self):`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False))`
Implement mannwhitney 2022-10-13 13:14:51 +11:00
			`def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):`
			`"""`
			`Perform a Mann-Whitney test`

			`brunnermunzel: Set to False to skip the Brunner-Munzel test`
			`use_continuity, alternative, method: See scipy.stats.mannwhitneyu`
			`"""`

			`# Check for/clean NaNs`
			`df = check_nan(df[[ind, dep]], nan_policy)`

			`# Ensure 2 groups for ind`
			`group1, data1, group2, data2 = as_2groups(df, dep, ind)`

			`# Do Mann-Whitney test`
			`# Stata does not perform continuity correction`
			`result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)`

			`u1 = result.statistic`
			`u2 = len(data1) * len(data2) - u1`
			`r = abs(2u1 / (len(data1) len(data2)) - 1) # rank-biserial`

			`# If significant, perform a Brunner-Munzel test for our interest`
			`if result.pvalue < 0.05 and brunnermunzel:`
			`result_bm = stats.brunnermunzel(data1, data2)`

			`if result_bm.pvalue >= 0.05:`
			`warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')`

			`return MannWhitneyResult(`
			`statistic=min(u1, u2), pvalue=result.pvalue,`
			`#med1=data1.median(), med2=data2.median(),`
			`rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),`
			`brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))`

			`return MannWhitneyResult(`
			`statistic=min(u1, u2), pvalue=result.pvalue,`
			`#med1=data1.median(), med2=data2.median(),`
			`rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))`
Implement chi2 2022-10-13 13:25:24 +11:00
			`# ------------------------`
			`# Pearson chi-squared test`

			`class PearsonChiSquaredResult:`
			`"""Result of a Pearson chi-squared test"""`

			`def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):`
			`self.ct = ct`
			`self.statistic = statistic`
			`self.dof = dof`
			`self.pvalue = pvalue`
			`self.oddsratio = oddsratio`
			`self.riskratio = riskratio`

			`def _repr_html_(self):`
			`if self.oddsratio is not None:`
			`return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}<br>OR (95% CI) = {}<br>RR (95% CI) = {}'.format(`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.oddsratio.summary(), self.riskratio.summary())`
Implement chi2 2022-10-13 13:25:24 +11:00			`else:`
			`return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True))`
Implement chi2 2022-10-13 13:25:24 +11:00
			`def summary(self):`
			`if self.oddsratio is not None:`
			`return '{}\nχ²({}) = {:.2f}; p {}\nOR (95% CI) = {}\nRR (95% CI) = {}'.format(`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.oddsratio.summary(), self.riskratio.summary())`
Implement chi2 2022-10-13 13:25:24 +11:00			`else:`
			`return '{}\nχ²({}) = {:.2f}; p {}'.format(`
Refactor fmt_p 2022-10-14 14:48:26 +11:00			`self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))`
Implement chi2 2022-10-13 13:25:24 +11:00
			`def chi2(df, dep, ind, *, nan_policy='warn'):`
Style fixups 2022-10-14 20:08:01 +11:00			`"""Perform a Pearson chi-squared test"""`
Implement chi2 2022-10-13 13:25:24 +11:00
			`# Check for/clean NaNs`
			`df = check_nan(df[[ind, dep]], nan_policy)`

			`# Compute contingency table`
			`ct = pd.crosstab(df[ind], df[dep])`

			`# Get expected counts`
			`expected = stats.contingency.expected_freq(ct)`

			`# Warn on low expected counts`
			`if (expected < 5).sum() / expected.size > 0.2:`
			`warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))`
			`if (expected < 1).any():`
			`warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))`

			`if ct.shape == (2,2):`
			`# 2x2 table`
Style fixups 2022-10-14 20:08:01 +11:00			`# Use statsmodels to get OR and RR`
Implement chi2 2022-10-13 13:25:24 +11:00
			`smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)`
			`result = smct.test_nominal_association()`
			`ORci = smct.oddsratio_confint()`
			`RRci = smct.riskratio_confint()`

			`return PearsonChiSquaredResult(`
			`ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,`
			`oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))`
			`else:`
			`# rxc table`
			`# Just use SciPy`

			`result = stats.chi2_contingency(ct, correction=False)`

			`return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])`
Implement anova_oneway 2022-10-14 20:18:25 +11:00
			`# -------------------`
			`# Pearson correlation`

			`class PearsonRResult:`
			`"""Result of Pearson correlation"""`

			`def __init__(self, statistic, pvalue):`
			`self.statistic = statistic`
			`self.pvalue = pvalue`

			`def _repr_html_(self):`
			`return '<i>r</i> (95% CI) = {}; <i>p</i> {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=True))`

			`def summary(self):`
			`return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=False))`

			`def pearsonr(df, dep, ind, *, nan_policy='warn'):`
			`"""Compute the Pearson correlation coefficient (Pearson's r)"""`

			`# Check for/clean NaNs`
			`df = check_nan(df[[ind, dep]], nan_policy)`

			`# Compute Pearson's r`
			`result = stats.pearsonr(df[ind], df[dep])`
			`ci = result.confidence_interval()`

			`return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)`