scipy-yli/yli/sig_tests.py

140 lines
4.1 KiB
Python
Raw Normal View History

2022-10-11 22:52:42 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import functools
import warnings
def check_nan(df, nan_policy):
"""Check df against nan_policy and return cleaned input"""
if nan_policy == 'raise':
if pd.isna(df).any(axis=None):
raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
elif nan_policy == 'warn':
df_cleaned = df.dropna()
if len(df_cleaned) < len(df):
warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
return df_cleaned
elif nan_policy == 'omit':
return df.dropna()
else:
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
def do_fmt_p(p):
"""Return sign and formatted p value"""
if p < 0.001:
return '<', '0.001*'
elif p < 0.0095:
return None, '{:.3f}*'.format(p)
elif p < 0.045:
return None, '{:.2f}*'.format(p)
elif p < 0.05:
return None, '{:.3f}*'.format(p) # 3dps to show significance
elif p < 0.055:
return None, '{:.3f}'.format(p) # 3dps to show non-significance
elif p < 0.095:
return None, '{:.2f}'.format(p)
else:
return None, '{:.1f}'.format(p)
def fmt_p_text(p, nospace=False):
"""Format p value for plaintext"""
sign, fmt = do_fmt_p(p)
if sign is not None:
if nospace:
return sign + fmt # e.g. "<0.001"
else:
return sign + ' ' + fmt # e.g. "< 0.001"
else:
if nospace:
return fmt # e.g. "0.05"
else:
return '= ' + fmt # e.g. "= 0.05"
def fmt_p_html(p, nospace=False):
"""Format p value for HTML"""
txt = fmt_p_text(p, nospace)
return txt.replace('<', '&lt;')
class Estimate:
"""A point estimate and surrounding confidence interval"""
def __init__(self, point, ci_lower, ci_upper):
self.point = point
self.ci_lower = ci_lower
self.ci_upper = ci_upper
def _repr_html_(self):
return self.summary()
def summary(self):
return '{:.2f} ({:.2f}{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)
class TTestResult:
"""
Result of a Student's t test
delta: Mean difference
"""
def __init__(self, statistic, dof, pvalue, delta):
self.statistic = statistic
self.dof = dof
self.pvalue = pvalue
self.delta = delta
def _repr_html_(self):
return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary())
def summary(self):
return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary())
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
"""Perform an independent-sample Student's t test"""
df = check_nan(df[[ind, dep]], nan_policy)
# Get groupings for ind
groups = list(df.groupby(ind).groups.values())
# Ensure only 2 groups to compare
if len(groups) != 2:
raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind))
# Get 2 groups
group1 = df.loc[groups[0], dep]
group2 = df.loc[groups[1], dep]
# Do t test
# Use statsmodels rather than SciPy because this provides the mean difference automatically
d1 = sm.stats.DescrStatsW(group1)
d2 = sm.stats.DescrStatsW(group2)
cm = sm.stats.CompareMeans(d2, d1) # This order to get correct CI
statistic, pvalue, dof = cm.ttest_ind()
delta = d2.mean - d1.mean
ci0, ci1 = cm.tconfint_diff()
return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1))
0