# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import numpy as np import pandas as pd from scipy import stats import statsmodels.api as sm import functools import warnings from .config import config from .utils import Estimate, PValueStyle, as_2groups, check_nan, convert_pandas_nullable, fmt_p # ---------------- # Student's t test class TTestResult: """ Result of a Student's *t* test See :func:`yli.ttest_ind`. """ def __init__(self, statistic, dof, pvalue, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction): #: *t* statistic (*float*) self.statistic = statistic #: Degrees of freedom of the *t* distribution (*int*) self.dof = dof #: *p* value for the *t* statistic (*float*) self.pvalue = pvalue #: Name of the first group (*str*) self.group1 = group1 #: Name of the second group (*str*) self.group2 = group2 #: Mean of the first group (*float*) self.mu1 = mu1 #: Mean of the second group (*float*) self.mu2 = mu2 #: Standard deviation of the first group (*float*) self.sd1 = sd1 #: Standard deviation of the second group (*float*) self.sd2 = sd2 #: Absolute value of the mean difference (:class:`yli.utils.Estimate`) self.delta = delta #: Description of the direction of the effect (*str*) self.delta_direction = delta_direction def _comparison_table(self, html): """Return a table showing the means/SDs for each group""" table_data = { self.group1: '{:.2f} ({:.2f})'.format(self.mu1, self.sd1), self.group2: '{:.2f} ({:.2f})'.format(self.mu2, self.sd2), } if html: table = pd.DataFrame(table_data, index=['\ue000 (SD)']) # U+E000 is in Private Use Area, mark μ symbol table_str = table._repr_html_() return table_str.replace('\ue000', 'μ') else: table = pd.DataFrame(table_data, index=['μ (SD)']) return str(table) def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return '{}
t({:.0f}) = {:.2f}; p {}
Δμ ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def summary(self): """ Return a stringified summary of the *t* test :rtype: str """ return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def ttest_ind(df, dep, ind, *, nan_policy='warn'): """ Perform an independent 2-sample Student's *t* test :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (dichotomous) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.TTestResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Type': ['Fresh'] * 10 + ['Stored'] * 10, 'Potency': [10.2, 10.5, 10.3, ...] }) yli.ttest_ind(df, 'Potency', 'Type') .. code-block:: text Fresh Stored μ (SD) 10.37 (0.32) 9.83 (0.24) t(18) = 4.24; p < 0.001* Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001. The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do t test # Use statsmodels rather than SciPy because this provides the mean difference automatically d1 = sm.stats.DescrStatsW(data1, ddof=1) d2 = sm.stats.DescrStatsW(data2, ddof=1) cm = sm.stats.CompareMeans(d1, d2) statistic, pvalue, dof = cm.ttest_ind() delta = d1.mean - d2.mean ci0, ci1 = cm.tconfint_diff(config.alpha) # t test is symmetric so take absolute value if d2.mean > d1.mean: delta, ci0, ci1 = -delta, -ci1, -ci0 d1, d2 = d2, d1 group1, group2 = group2, group1 # Now group1 > group2 return TTestResult( statistic=abs(statistic), dof=dof, pvalue=pvalue, group1=group1, group2=group2, mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std, delta=Estimate(delta, ci0, ci1), delta_direction='{} > {}'.format(group1, group2)) # ------------- # One-way ANOVA class FTestResult: """ Result of an *F* test for ANOVA/regression See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionResult.ftest`. """ def __init__(self, statistic, dof1, dof2, pvalue): #: *F* statistic (*float*) self.statistic = statistic #: Degrees of freedom in the *F* distribution numerator (*int*) self.dof1 = dof1 #: Degrees of freedom in the *F* distribution denominator (*int*) self.dof2 = dof2 #: *p* value for the *F* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *F* test :rtype: str """ return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def anova_oneway(df, dep, ind, *, nan_policy='omit'): """ Perform one-way ANOVA :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.FTestResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Method': [1]*8 + [2]*7 + [3]*9, 'Score': [96, 79, 91, ...] }) yli.anova_oneway(df, 'Score', 'Method') .. code-block:: text F(2, 21) = 29.57; p < 0.001* The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Group by independent variable groups = df.groupby(ind)[dep] # Perform one-way ANOVA result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups]) # See stats.f_oneway implementation dfbn = len(groups.groups) - 1 dfwn = len(df) - len(groups.groups) return FTestResult(result.statistic, dfbn, dfwn, result.pvalue) # ----------------- # Mann-Whitney test class MannWhitneyResult: """ Result of a Mann-Whitney *U* test See :func:`yli.mannwhitney`. """ def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None): #: Lesser of the two Mann–Whitney *U* statistics (*float*) self.statistic = statistic #: *p* value for the *U* statistic (*float*) self.pvalue = pvalue #: Absolute value of the rank-biserial correlation (*float*) self.rank_biserial = rank_biserial #: Description of the direction of the effect (*str*) self.direction = direction #: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A self.brunnermunzel = brunnermunzel def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): line1 = 'U = {:.1f}; p {}
r = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '
' + self.brunnermunzel._repr_html_() else: return line1 def summary(self): """ Return a stringified summary of the Mann–Whitney test :rtype: str """ line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() else: return line1 class BrunnerMunzelResult: """ Result of a Brunner–Munzel test See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*. """ """Result of a Brunner-Munzel test""" def __init__(self, statistic, pvalue): #: *W* statistic (*float*) self.statistic = statistic #: *p* value for the *W* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Brunner–Munzel test :rtype: str """ return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'): """ Perform a Mann-Whitney *U* test By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant. If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised. The Brunner–Munzel test is returned only if non-significant. :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (dichotomous) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant :type brunnermunzel: bool :param use_continuity: See *scipy.stats.mannwhitneyu* :param alternative: See *scipy.stats.mannwhitneyu* :param method: See *scipy.stats.mannwhitneyu* :return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant. :rtype: :class:`yli.sig_tests.MannWhitneyResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Sample': ['Before'] * 12 + ['After'] * 12, 'Oxygen': [11.0, 11.2, 11.2, ...] }) yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less') .. code-block:: text U = 6.0; p < 0.001* r = 0.92, Before > After The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001. The rank-biserial correlation is 0.92 in favour of the *Before* group. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do Mann-Whitney test # Stata does not perform continuity correction result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) u1 = result.statistic u2 = len(data1) * len(data2) - u1 r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial # If significant, perform a Brunner-Munzel test for our interest if result.pvalue < 0.05 and brunnermunzel: result_bm = stats.brunnermunzel(data1, data2) if result_bm.pvalue >= 0.05: warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.') return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2), brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue)) return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2)) # ------------------------ # Pearson chi-squared test class PearsonChiSquaredResult: """ Result of a Pearson *χ*:sup:`2` test See :func:`yli.chi2`. """ def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): #: Contingency table for the observations (*DataFrame*) self.ct = ct #: *χ*:sup:`2` statistic (*float*) self.statistic = statistic #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) self.dof = dof #: *p* value for the *χ*:sup:`2` test (*float*) self.pvalue = pvalue #: Odds ratio (*float*; *None* if not a 2×2 table) self.oddsratio = oddsratio #: Risk ratio (*float*; *None* if not a 2×2 table) self.riskratio = riskratio def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): if self.oddsratio is not None: return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}
χ2({}) = {:.2f}; p {}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *χ*:sup:`2` test :rtype: str """ if self.oddsratio is not None: return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}\n\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def chi2(df, dep, ind, *, nan_policy='warn'): """ Perform a Pearson *χ*:sup:`2` test If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated. The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean). The risk ratio is calculated relative to the independent variable (rows of the contingency table). :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (categorical) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]), 'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600]) }) yli.chi2(df, 'Stress', 'Response') .. code-block:: text Stress False True Response False 250 400 True 750 1600 χ²(1) = 9.82; p = 0.002* OR (95% CI) = 1.33 (1.11–1.60) RR (95% CI) = 1.11 (1.03–1.18) The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002. The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60. The risk of *Stress* in the *Response* = *True* group is 1.11 that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute contingency table ct = pd.crosstab(df[ind], df[dep]) # Get expected counts expected = stats.contingency.expected_freq(ct) # Warn on low expected counts if (expected < 5).sum() / expected.size > 0.2: warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100)) if (expected < 1).any(): warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum())) if ct.shape == (2,2): # 2x2 table # Use statsmodels to get OR and RR smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False) result = smct.test_nominal_association() ORci = smct.oddsratio_confint(config.alpha) RRci = smct.riskratio_confint(config.alpha) return PearsonChiSquaredResult( ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue, oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1])) else: # rxc table # Just use SciPy result = stats.chi2_contingency(ct, correction=False) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) # ------------------- # Pearson correlation class PearsonRResult: """ Result of Pearson correlation See :func:`yli.pearsonr`. """ def __init__(self, statistic, pvalue): #: Pearson *r* correlation statistic (*float*) self.statistic = statistic #: *p* value for the *r* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Pearson correlation :rtype: str """ return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION)) def pearsonr(df, dep, ind, *, nan_policy='warn'): """ Compute the Pearson correlation coefficient (Pearson's *r*) :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numerical) :type dep: str :param ind: Column in *df* for the independent variable (numerical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonRResult` **Example:** .. code-block:: df = pd.DataFrame({ 'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52], 'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49] }) yli.pearsonr(df, 'y', 'x') .. code-block:: text r (95% CI) = 0.65 (0.11–0.89); p = 0.02* The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.11–0.89, and the test is significant with *p* value 0.02. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute Pearson's r result = stats.pearsonr(df[ind], df[dep]) ci = result.confidence_interval() return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)