Implement ttest_ind

2022-10-13 12:53:18 +11:00 · 2022-10-13 12:53:18 +11:00 · 6b43034a50
commit 6b43034a50
parent 97a01b4e80
3 changed files with 178 additions and 86 deletions
--- a/tests/test_ttest.py
+++ b/tests/test_ttest.py
@ -0,0 +1,40 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
 #   Copyright © 2022  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
 #   the Free Software Foundation, either version 3 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU Affero General Public License for more details.
 #
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 from pytest import approx
 import numpy as np
 import pandas as pd
 import yli
 def test_ttest_ind_ol6_1():
 	"""Compare yli.ttest_ind for Ott & Longnecker (2016) example 6.1"""
 	# Potency readings, listed per group in the same order as the original table
 	fresh = [10.2, 10.5, 10.3, 10.8, 9.8, 10.6, 10.7, 10.2, 10.0, 10.6]
 	stored = [9.8, 9.6, 10.1, 10.2, 10.1, 9.7, 9.5, 9.6, 9.8, 9.9]
 	df = pd.DataFrame({
 		'Type': ['Fresh'] * len(fresh) + ['Stored'] * len(stored),
 		'Potency': fresh + stored
 	})
 	result = yli.ttest_ind(df, 'Potency', 'Type')
 	# Expected t: mean difference 0.54 over 0.285 * sqrt(1/10 + 1/10)
 	standard_error = 0.285 * np.sqrt(1/10 + 1/10)
 	t_expected = 0.54 / standard_error
 	assert result.statistic == approx(t_expected, abs=0.01)
 	assert result.dof == 18
 	# Mean difference and its confidence limits, each within 0.01
 	delta = result.delta
 	assert delta.point == approx(0.54, abs=0.01)
 	assert delta.ci_lower == approx(0.272, abs=0.01)
 	assert delta.ci_upper == approx(0.808, abs=0.01)
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -21,74 +21,10 @@ import statsmodels.api as sm
 import functools
 import warnings
-def check_nan(df, nan_policy):
+from .utils import Estimate, as_2groups, check_nan, fmt_p_html, fmt_p_text
 	"""Check df against nan_policy and return cleaned input"""
 	if nan_policy == 'raise':
 		if pd.isna(df).any(axis=None):
 			raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
 	elif nan_policy == 'warn':
 		df_cleaned = df.dropna()
 		if len(df_cleaned) < len(df):
 			warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
 		return df_cleaned
 	elif nan_policy == 'omit':
 		return df.dropna()
 	else:
 		raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
-def do_fmt_p(p):
+# ----------------
-	"""Return sign and formatted p value"""
+# Student's t test
 	if p < 0.001:
 		return '<', '0.001*'
 	elif p < 0.0095:
 		return None, '{:.3f}*'.format(p)
 	elif p < 0.045:
 		return None, '{:.2f}*'.format(p)
 	elif p < 0.05:
 		return None, '{:.3f}*'.format(p)  # 3dps to show significance
 	elif p < 0.055:
 		return None, '{:.3f}'.format(p)   # 3dps to show non-significance
 	elif p < 0.095:
 		return None, '{:.2f}'.format(p)
 	else:
 		return None, '{:.1f}'.format(p)
 def fmt_p_text(p, nospace=False):
 	"""Format p value for plaintext"""
 	sign, digits = do_fmt_p(p)
 	# do_fmt_p gives no sign for an exact value; show it with "=" unless nospace
 	if sign is None:
 		return digits if nospace else '= ' + digits  # e.g. "0.05" / "= 0.05"
 	# Otherwise keep the sign, separated by a space unless nospace
 	separator = '' if nospace else ' '
 	return sign + separator + digits  # e.g. "<0.001" / "< 0.001"
 def fmt_p_html(p, nospace=False):
 	"""Format p value for HTML"""
 	# Same text as the plaintext form, with "<" written as an HTML entity
 	return fmt_p_text(p, nospace).replace('<', '&lt;')
 class Estimate:
 	"""A point estimate and surrounding confidence interval"""
 	def __init__(self, point, ci_lower, ci_upper):
 		# Values are stored as given, without validation
 		self.point, self.ci_lower, self.ci_upper = point, ci_lower, ci_upper
 	def _repr_html_(self):
 		"""Return the HTML representation, which is the same text as summary()"""
 		return self.summary()
 	def summary(self):
 		"""Return "point (lower–upper)" with each value to 2 decimal places"""
 		values = (self.point, self.ci_lower, self.ci_upper)
 		return '{:.2f} ({:.2f}–{:.2f})'.format(*values)
 class TTestResult:
 	"""
@ -97,44 +33,41 @@ class TTestResult:
 	delta: Mean difference
 	"""
-	def __init__(self, statistic, dof, pvalue, delta):
+	def __init__(self, statistic, dof, pvalue, delta, delta_direction):
 		self.statistic = statistic
 		self.dof = dof
 		self.pvalue = pvalue
 		self.delta = delta
 		self.delta_direction = delta_direction
 	def _repr_html_(self):
-		return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary())
+		return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary(), self.delta_direction)
 	def summary(self):
-		return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary())
+		return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary(), self.delta_direction)
 def ttest_ind(df, dep, ind, *, nan_policy='warn'):
 	"""Perform an independent-sample Student's t test"""
 	# Check for/clean NaNs
 	df = check_nan(df[[ind, dep]], nan_policy)
-	# Get groupings for ind
+	# Ensure 2 groups for ind
-	groups = list(df.groupby(ind).groups.values())
+	group1, data1, group2, data2 = as_2groups(df, dep, ind)
 	# Ensure only 2 groups to compare
 	if len(groups) != 2:
 		raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind))
 	# Get 2 groups
 	group1 = df.loc[groups[0], dep]
 	group2 = df.loc[groups[1], dep]
 	# Do t test
 	# Use statsmodels rather than SciPy because this provides the mean difference automatically
-	d1 = sm.stats.DescrStatsW(group1)
+	d1 = sm.stats.DescrStatsW(data1)
-	d2 = sm.stats.DescrStatsW(group2)
+	d2 = sm.stats.DescrStatsW(data2)
-	cm = sm.stats.CompareMeans(d2, d1)  # This order to get correct CI
+	cm = sm.stats.CompareMeans(d1, d2)
 	statistic, pvalue, dof = cm.ttest_ind()
-	delta = d2.mean - d1.mean
+	delta = d1.mean - d2.mean
 	ci0, ci1 = cm.tconfint_diff()
-	return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1))
+	# t test is symmetric so take absolute values
-0
+	return TTestResult(
 		statistic=abs(statistic), dof=dof, pvalue=pvalue,
 		delta=abs(Estimate(delta, ci0, ci1)),
 		delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))
--- a/yli/utils.py
+++ b/yli/utils.py
@ -0,0 +1,119 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
 #   Copyright © 2022  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
 #   the Free Software Foundation, either version 3 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU Affero General Public License for more details.
 #
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 import numpy as np
 import pandas as pd
 import warnings
 def check_nan(df, nan_policy):
 	"""
 	Check df against nan_policy and return cleaned input
 	df: pandas object to check
 	nan_policy: 'raise' to raise ValueError if df contains any NaN, 'warn' to drop rows with NaN and emit a warning if any were dropped, 'omit' to drop rows with NaN silently
 	Returns df itself under 'raise' (when it holds no NaN), otherwise the result of df.dropna()
 	Raises ValueError for NaN under 'raise', and Exception for an unrecognised nan_policy
 	"""
 	if nan_policy == 'raise':
 		if pd.isna(df).any(axis=None):
 			raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
 		# Nothing to clean, but callers use the return value, so hand the input back rather than None
 		return df
 	elif nan_policy == 'warn':
 		df_cleaned = df.dropna()
 		n_dropped = len(df) - len(df_cleaned)
 		if n_dropped > 0:
 			warnings.warn('Omitting {} rows with NaN'.format(n_dropped))
 		return df_cleaned
 	elif nan_policy == 'omit':
 		return df.dropna()
 	else:
 		raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
 def as_2groups(df, data, group):
 	"""
 	Group the data by the given variable, ensuring only 2 groups
 	df: DataFrame holding both columns
 	data: Name of the column whose values are returned for each group
 	group: Name of the column to group by
 	Returns (group1, data1, group2, data2): each group's label followed by that group's values from the data column
 	Raises Exception if the grouping column does not yield exactly 2 groups
 	"""
 	# (label, row index) pairs, one per distinct value of the grouping column
 	labelled_indexes = list(df.groupby(group).groups.items())
 	n_groups = len(labelled_indexes)
 	if n_groups != 2:
 		raise Exception('Got {} values for {}, expected 2'.format(n_groups, group))
 	(group1, index1), (group2, index2) = labelled_indexes
 	return group1, df.loc[index1, data], group2, df.loc[index2, data]
 def do_fmt_p(p):
 	"""
 	Return sign and formatted p value
 	The sign is '<' when p is below 0.001 and None otherwise; formats used for p below 0.05 end in '*'
 	"""
 	# Below the smallest threshold only an upper bound is reported
 	if p < 0.001:
 		return '<', '0.001*'
 	# (exclusive upper bound, format) pairs, checked in ascending order
 	bands = (
 		(0.0095, '{:.3f}*'),
 		(0.045, '{:.2f}*'),
 		(0.05, '{:.3f}*'),  # 3dps to show significance
 		(0.055, '{:.3f}'),  # 3dps to show non-significance
 		(0.095, '{:.2f}'),
 	)
 	for upper, template in bands:
 		if p < upper:
 			return None, template.format(p)
 	# Anything not caught above is shown to 1 decimal place
 	return None, '{:.1f}'.format(p)
 def fmt_p_text(p, nospace=False):
 	"""
 	Format p value for plaintext
 	p: p value, passed to do_fmt_p
 	nospace: If true, omit the leading "= " for exact values and the space after the sign otherwise
 	"""
 	sign, digits = do_fmt_p(p)
 	# No sign from do_fmt_p means an exact value, shown with "=" unless nospace
 	if sign is None:
 		return digits if nospace else '= ' + digits  # e.g. "0.05" / "= 0.05"
 	separator = '' if nospace else ' '
 	return sign + separator + digits  # e.g. "<0.001" / "< 0.001"
 def fmt_p_html(p, nospace=False):
 	"""Format p value for HTML"""
 	# Reuse the plaintext formatting, then write "<" as an HTML entity
 	return fmt_p_text(p, nospace).replace('<', '&lt;')
 class Estimate:
 	"""A point estimate and surrounding confidence interval"""
 	def __init__(self, point, ci_lower, ci_upper):
 		# Values are stored as given, without validation
 		self.point, self.ci_lower, self.ci_upper = point, ci_lower, ci_upper
 	def _repr_html_(self):
 		"""Return the HTML representation, which is the same text as summary()"""
 		return self.summary()
 	def summary(self):
 		"""Return "point (lower–upper)" with each value to 2 decimal places"""
 		values = (self.point, self.ci_lower, self.ci_upper)
 		return '{:.2f} ({:.2f}–{:.2f})'.format(*values)
 	def __neg__(self):
 		"""Return the negated estimate; the bounds swap so the negated upper bound becomes the lower bound"""
 		negated_point = -self.point
 		new_lower, new_upper = -self.ci_upper, -self.ci_lower
 		return Estimate(negated_point, new_lower, new_upper)
 	def __abs__(self):
 		"""Return the negated estimate when point < 0, otherwise self"""
 		return -self if self.point < 0 else self
 	def exp(self):
 		"""Return a new Estimate with np.exp applied to the point and to both bounds"""
 		return Estimate(*(np.exp(value) for value in (self.point, self.ci_lower, self.ci_upper)))