Implement ttest_ind

RunasSudo 2022-10-11 22:52:42 +11:00
parent 582cfa5893
commit 97a01b4e80
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 142 additions and 1 deletion


@@ -14,7 +14,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
-from .distributions import *
+from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
+from .sig_tests import ttest_ind
 
 def reload_me():
 	import importlib
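
After this hunk the new test is re-exported from the package root, so either import path below should work once the package is importable as yli (a minimal sketch, not part of the diff):

import yli                             # then call yli.ttest_ind(...)
from yli.sig_tests import ttest_ind    # or import the function directly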

yli/sig_tests.py Normal file

@@ -0,0 +1,140 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import pandas as pd
from scipy import stats
import statsmodels.api as sm
import functools
import warnings

def check_nan(df, nan_policy):
	"""Check df against nan_policy and return cleaned input"""
	
	if nan_policy == 'raise':
		if pd.isna(df).any(axis=None):
			raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
		return df  # No NaN found, return input unchanged
	elif nan_policy == 'warn':
		df_cleaned = df.dropna()
		if len(df_cleaned) < len(df):
			warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
		return df_cleaned
	elif nan_policy == 'omit':
		return df.dropna()
	else:
		raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')

def do_fmt_p(p):
	"""Return sign and formatted p value"""
	
	if p < 0.001:
		return '<', '0.001*'
	elif p < 0.0095:
		return None, '{:.3f}*'.format(p)
	elif p < 0.045:
		return None, '{:.2f}*'.format(p)
	elif p < 0.05:
		return None, '{:.3f}*'.format(p)  # 3dps to show significance
	elif p < 0.055:
		return None, '{:.3f}'.format(p)  # 3dps to show non-significance
	elif p < 0.095:
		return None, '{:.2f}'.format(p)
	else:
		return None, '{:.1f}'.format(p)

def fmt_p_text(p, nospace=False):
	"""Format p value for plaintext"""
	
	sign, fmt = do_fmt_p(p)
	if sign is not None:
		if nospace:
			return sign + fmt  # e.g. "<0.001"
		else:
			return sign + ' ' + fmt  # e.g. "< 0.001"
	else:
		if nospace:
			return fmt  # e.g. "0.05"
		else:
			return '= ' + fmt  # e.g. "= 0.05"

def fmt_p_html(p, nospace=False):
	"""Format p value for HTML"""
	
	txt = fmt_p_text(p, nospace)
	return txt.replace('<', '&lt;')

class Estimate:
	"""A point estimate and surrounding confidence interval"""
	
	def __init__(self, point, ci_lower, ci_upper):
		self.point = point
		self.ci_lower = ci_lower
		self.ci_upper = ci_upper
	
	def _repr_html_(self):
		return self.summary()
	
	def summary(self):
		return '{:.2f} ({:.2f}–{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)

class TTestResult:
	"""
	Result of a Student's t test
	
	delta: Mean difference
	"""
	
	def __init__(self, statistic, dof, pvalue, delta):
		self.statistic = statistic
		self.dof = dof
		self.pvalue = pvalue
		self.delta = delta
	
	def _repr_html_(self):
		return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary())
	
	def summary(self):
		return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary())

def ttest_ind(df, dep, ind, *, nan_policy='warn'):
	"""Perform an independent-sample Student's t test"""
	
	df = check_nan(df[[ind, dep]], nan_policy)
	
	# Get groupings for ind
	groups = list(df.groupby(ind).groups.values())
	
	# Ensure only 2 groups to compare
	if len(groups) != 2:
		raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind))
	
	# Get 2 groups
	group1 = df.loc[groups[0], dep]
	group2 = df.loc[groups[1], dep]
	
	# Do t test
	# Use statsmodels rather than SciPy because this provides the mean difference automatically
	d1 = sm.stats.DescrStatsW(group1)
	d2 = sm.stats.DescrStatsW(group2)
	
	cm = sm.stats.CompareMeans(d2, d1)  # This order to get correct CI
	statistic, pvalue, dof = cm.ttest_ind()
	
	delta = d2.mean - d1.mean
	ci0, ci1 = cm.tconfint_diff()
	
	return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1))
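
For illustration, a minimal usage sketch of the new function (not part of the commit): the DataFrame below is made-up example data, and 'Group' (independent/grouping variable) and 'Height' (dependent variable) are placeholder column names.

import pandas as pd
import yli

df = pd.DataFrame({
	'Group': ['Control'] * 5 + ['Treatment'] * 5,
	'Height': [162, 171, 168, 175, 180, 158, 166, 173, 169, 177],
})

result = yli.ttest_ind(df, 'Height', 'Group')  # dep = 'Height', ind = 'Group'
print(result.summary())
# Prints, per TTestResult.summary: "t(8) = <statistic>; p <formatted p>"
# followed by "δ (95% CI) = <mean difference> (<lower>–<upper>)"

The grouping column must take exactly 2 distinct values or ttest_ind raises, and rows containing NaN are dropped with a warning under the default nan_policy='warn'.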