# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import numpy as np import pandas as pd from scipy import stats import statsmodels.api as sm import functools import warnings from .config import config from .utils import Estimate, as_2groups, check_nan, fmt_p # ---------------- # Student's t test class TTestResult: """ Result of a Student's t test delta: Mean difference """ def __init__(self, statistic, dof, pvalue, delta, delta_direction): self.statistic = statistic self.dof = dof self.pvalue = pvalue self.delta = delta self.delta_direction = delta_direction def _repr_html_(self): return 't({:.0f}) = {:.2f}; p {}
Δμ ({:g}% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def summary(self): return 't({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def ttest_ind(df, dep, ind, *, nan_policy='warn'): """Perform an independent-sample Student's t test""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do t test # Use statsmodels rather than SciPy because this provides the mean difference automatically d1 = sm.stats.DescrStatsW(data1) d2 = sm.stats.DescrStatsW(data2) cm = sm.stats.CompareMeans(d1, d2) statistic, pvalue, dof = cm.ttest_ind() delta = d1.mean - d2.mean ci0, ci1 = cm.tconfint_diff(config.alpha) # t test is symmetric so take absolute values return TTestResult( statistic=abs(statistic), dof=dof, pvalue=pvalue, delta=abs(Estimate(delta, ci0, ci1)), delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2)) # ------------- # One-way ANOVA class FTestResult: """Result of an F test for ANOVA/regression""" def __init__(self, statistic, dof1, dof2, pvalue): self.statistic = statistic self.dof1 = dof1 self.dof2 = dof2 self.pvalue = pvalue def _repr_html_(self): return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False)) def anova_oneway(df, dep, ind, *, nan_policy='omit'): """Perform one-way ANOVA""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Group by independent variable groups = df.groupby(ind)[dep] # Perform one-way ANOVA result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups]) # See stats.f_oneway implementation dfbn = len(groups.groups) - 1 dfwn = len(df) - len(groups.groups) return FTestResult(result.statistic, dfbn, dfwn, result.pvalue) # ----------------- # Mann-Whitney test class MannWhitneyResult: """ Result of a Mann-Whitney test brunnermunzel: BrunnerMunzelResult on same data """ def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None): self.statistic = statistic self.pvalue = pvalue self.rank_biserial = rank_biserial self.direction = direction self.brunnermunzel = brunnermunzel def _repr_html_(self): line1 = 'U = {:.1f}; p {}
r = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=True), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '
' + self.brunnermunzel._repr_html_() else: return line1 def summary(self): line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() else: return line1 class BrunnerMunzelResult: """Result of a Brunner-Munzel test""" def __init__(self, statistic, pvalue): self.statistic = statistic self.pvalue = pvalue def _repr_html_(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False)) def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'): """ Perform a Mann-Whitney test brunnermunzel: Set to False to skip the Brunner-Munzel test use_continuity, alternative, method: See scipy.stats.mannwhitneyu """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do Mann-Whitney test # Stata does not perform continuity correction result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) u1 = result.statistic u2 = len(data1) * len(data2) - u1 r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial # If significant, perform a Brunner-Munzel test for our interest if result.pvalue < 0.05 and brunnermunzel: result_bm = stats.brunnermunzel(data1, data2) if result_bm.pvalue >= 0.05: warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.') return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2), brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue)) return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2)) # ------------------------ # Pearson chi-squared test class PearsonChiSquaredResult: """Result of a Pearson chi-squared test""" def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): self.ct = ct self.statistic = statistic self.dof = dof self.pvalue = pvalue self.oddsratio = oddsratio self.riskratio = riskratio def _repr_html_(self): if self.oddsratio is not None: return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}
χ2({}) = {:.2f}; p {}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True)) def summary(self): if self.oddsratio is not None: return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False)) def chi2(df, dep, ind, *, nan_policy='warn'): """Perform a Pearson chi-squared test""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute contingency table ct = pd.crosstab(df[ind], df[dep]) # Get expected counts expected = stats.contingency.expected_freq(ct) # Warn on low expected counts if (expected < 5).sum() / expected.size > 0.2: warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100)) if (expected < 1).any(): warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum())) if ct.shape == (2,2): # 2x2 table # Use statsmodels to get OR and RR smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False) result = smct.test_nominal_association() ORci = smct.oddsratio_confint(config.alpha) RRci = smct.riskratio_confint(config.alpha) return PearsonChiSquaredResult( ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue, oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1])) else: # rxc table # Just use SciPy result = stats.chi2_contingency(ct, correction=False) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) # ------------------- # Pearson correlation class PearsonRResult: """Result of Pearson correlation""" def __init__(self, statistic, pvalue): self.statistic = statistic self.pvalue = pvalue def _repr_html_(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, html=True)) def summary(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, html=False)) def pearsonr(df, dep, ind, *, nan_policy='warn'): """Compute the Pearson correlation coefficient (Pearson's r)""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute Pearson's r result = stats.pearsonr(df[ind], df[dep]) ci = result.confidence_interval() return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)