# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import pandas as pd from scipy import stats import statsmodels.api as sm import functools import warnings def check_nan(df, nan_policy): """Check df against nan_policy and return cleaned input""" if nan_policy == 'raise': if pd.isna(df).any(axis=None): raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore') elif nan_policy == 'warn': df_cleaned = df.dropna() if len(df_cleaned) < len(df): warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned))) return df_cleaned elif nan_policy == 'omit': return df.dropna() else: raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"') def do_fmt_p(p): """Return sign and formatted p value""" if p < 0.001: return '<', '0.001*' elif p < 0.0095: return None, '{:.3f}*'.format(p) elif p < 0.045: return None, '{:.2f}*'.format(p) elif p < 0.05: return None, '{:.3f}*'.format(p) # 3dps to show significance elif p < 0.055: return None, '{:.3f}'.format(p) # 3dps to show non-significance elif p < 0.095: return None, '{:.2f}'.format(p) else: return None, '{:.1f}'.format(p) def fmt_p_text(p, nospace=False): """Format p value for plaintext""" sign, fmt = do_fmt_p(p) if sign is not None: if nospace: return sign + fmt # e.g. "<0.001" else: return sign + ' ' + fmt # e.g. "< 0.001" else: if nospace: return fmt # e.g. "0.05" else: return '= ' + fmt # e.g. "= 0.05" def fmt_p_html(p, nospace=False): """Format p value for HTML""" txt = fmt_p_text(p, nospace) return txt.replace('<', '<') class Estimate: """A point estimate and surrounding confidence interval""" def __init__(self, point, ci_lower, ci_upper): self.point = point self.ci_lower = ci_lower self.ci_upper = ci_upper def _repr_html_(self): return self.summary() def summary(self): return '{:.2f} ({:.2f}–{:.2f})'.format(self.point, self.ci_lower, self.ci_upper) class TTestResult: """ Result of a Student's t test delta: Mean difference """ def __init__(self, statistic, dof, pvalue, delta): self.statistic = statistic self.dof = dof self.pvalue = pvalue self.delta = delta def _repr_html_(self): return 't({:.0f}) = {:.2f}; p {}
δ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary()) def summary(self): return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary()) def ttest_ind(df, dep, ind, *, nan_policy='warn'): """Perform an independent-sample Student's t test""" df = check_nan(df[[ind, dep]], nan_policy) # Get groupings for ind groups = list(df.groupby(ind).groups.values()) # Ensure only 2 groups to compare if len(groups) != 2: raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind)) # Get 2 groups group1 = df.loc[groups[0], dep] group2 = df.loc[groups[1], dep] # Do t test # Use statsmodels rather than SciPy because this provides the mean difference automatically d1 = sm.stats.DescrStatsW(group1) d2 = sm.stats.DescrStatsW(group2) cm = sm.stats.CompareMeans(d2, d1) # This order to get correct CI statistic, pvalue, dof = cm.ttest_ind() delta = d2.mean - d1.mean ci0, ci1 = cm.tconfint_diff() return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1)) 0