# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import numpy as np import pandas as pd from scipy import stats import statsmodels.api as sm import functools import warnings from .config import config from .utils import ConfidenceInterval, Estimate, PValueStyle, as_2groups, check_nan, convert_pandas_nullable, fmt_p # ---------------- # Student's t test class TTestResult: """ Result of a Student's *t* test See :func:`yli.ttest_ind`. """ def __init__(self, statistic, dof, pvalue, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction): #: *t* statistic (*float*) self.statistic = statistic #: Degrees of freedom of the *t* distribution (*int*) self.dof = dof #: *p* value for the *t* statistic (*float*) self.pvalue = pvalue #: Name of the first group (*str*) self.group1 = group1 #: Name of the second group (*str*) self.group2 = group2 #: Mean of the first group (*float*) self.mu1 = mu1 #: Mean of the second group (*float*) self.mu2 = mu2 #: Standard deviation of the first group (*float*) self.sd1 = sd1 #: Standard deviation of the second group (*float*) self.sd2 = sd2 #: Absolute value of the mean difference (:class:`yli.utils.Estimate`) self.delta = delta #: Description of the direction of the effect (*str*) self.delta_direction = delta_direction def _comparison_table(self, html): """Return a table showing the means/SDs for each group""" table_data = { self.group1: '{:.2f} ({:.2f})'.format(self.mu1, self.sd1), self.group2: '{:.2f} ({:.2f})'.format(self.mu2, self.sd2), } if html: table = pd.DataFrame(table_data, index=['\ue000 (SD)']) # U+E000 is in Private Use Area, mark μ symbol table_str = table._repr_html_() return table_str.replace('\ue000', 'μ') else: table = pd.DataFrame(table_data, index=['μ (SD)']) return str(table) def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return '{}
t({:.0f}) = {:.2f}; p {}
Δμ ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def summary(self): """ Return a stringified summary of the *t* test :rtype: str """ return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) def ttest_ind(df, dep, ind, *, nan_policy='warn'): """ Perform an independent 2-sample Student's *t* test :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (dichotomous) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.TTestResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Type': ['Fresh'] * 10 + ['Stored'] * 10, 'Potency': [10.2, 10.5, 10.3, ...] }) yli.ttest_ind(df, 'Potency', 'Type') .. code-block:: text Fresh Stored μ (SD) 10.37 (0.32) 9.83 (0.24) t(18) = 4.24; p < 0.001* Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001. The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do t test # Use statsmodels rather than SciPy because this provides the mean difference automatically d1 = sm.stats.DescrStatsW(data1, ddof=1) d2 = sm.stats.DescrStatsW(data2, ddof=1) cm = sm.stats.CompareMeans(d1, d2) statistic, pvalue, dof = cm.ttest_ind() delta = d1.mean - d2.mean ci0, ci1 = cm.tconfint_diff(config.alpha) # t test is symmetric so take absolute value if d2.mean > d1.mean: delta, ci0, ci1 = -delta, -ci1, -ci0 d1, d2 = d2, d1 group1, group2 = group2, group1 # Now group1 > group2 return TTestResult( statistic=abs(statistic), dof=dof, pvalue=pvalue, group1=group1, group2=group2, mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std, delta=Estimate(delta, ci0, ci1), delta_direction='{} > {}'.format(group1, group2)) # ------------- # One-way ANOVA class FTestResult: """ Result of an *F* test for ANOVA/regression See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionResult.ftest`. """ def __init__(self, statistic, dof1, dof2, pvalue): #: *F* statistic (*float*) self.statistic = statistic #: Degrees of freedom in the *F* distribution numerator (*int*) self.dof1 = dof1 #: Degrees of freedom in the *F* distribution denominator (*int*) self.dof2 = dof2 #: *p* value for the *F* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *F* test :rtype: str """ return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def anova_oneway(df, dep, ind, *, nan_policy='omit'): """ Perform one-way ANOVA :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.FTestResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Method': [1]*8 + [2]*7 + [3]*9, 'Score': [96, 79, 91, ...] }) yli.anova_oneway(df, 'Score', 'Method') .. code-block:: text F(2, 21) = 29.57; p < 0.001* The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Group by independent variable groups = df.groupby(ind)[dep] # Perform one-way ANOVA result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups]) # See stats.f_oneway implementation dfbn = len(groups.groups) - 1 dfwn = len(df) - len(groups.groups) return FTestResult(result.statistic, dfbn, dfwn, result.pvalue) # ----------------- # Mann-Whitney test class MannWhitneyResult: """ Result of a Mann-Whitney *U* test See :func:`yli.mannwhitney`. """ def __init__(self, statistic, pvalue, group1, group2, med1, med2, iqr1, iqr2, range1, range2, rank_biserial, direction, brunnermunzel=None): #: Lesser of the two Mann–Whitney *U* statistics (*float*) self.statistic = statistic #: *p* value for the *U* statistic (*float*) self.pvalue = pvalue #: Name of the first group (*str*) self.group1 = group1 #: Name of the second group (*str*) self.group2 = group2 #: Median of the first group (*float*) self.med1 = med1 #: Median of the second group (*float*) self.med2 = med2 #: Interquartile range of the first group (:class:`yli.utils.ConfidenceInterval`) self.iqr1 = iqr1 #: Interquartile range of the second group (:class:`yli.utils.ConfidenceInterval`) self.iqr2 = iqr2 #: Range of the first group (:class:`yli.utils.ConfidenceInterval`) self.range1 = range2 #: Range of the second group (:class:`yli.utils.ConfidenceInterval`) self.range2 = range2 #: Absolute value of the rank-biserial correlation (*float*) self.rank_biserial = rank_biserial #: Description of the direction of the effect (*str*) self.direction = direction #: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A self.brunnermunzel = brunnermunzel def _comparison_table(self, html): """Return a table showing the medians/IQRs/ranges for each group""" table_data = { self.group1: ['{:.2f} ({})'.format(self.med1, self.iqr1.summary()), '{:.2f} ({})'.format(self.med1, self.range1.summary())], self.group2: ['{:.2f} ({})'.format(self.med2, self.iqr2.summary()), '{:.2f} ({})'.format(self.med2, self.range2.summary())], } table = pd.DataFrame(table_data, index=['Median (IQR)', 'Median (range)']) if html: return table._repr_html_() else: return str(table) def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): line1 = '{}
U = {:.1f}; p {}
r = {:.2f}, {}'.format(self._comparison_table(True), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '
' + self.brunnermunzel._repr_html_() else: return line1 def summary(self): """ Return a stringified summary of the Mann–Whitney test :rtype: str """ line1 = '{}\n\nU = {:.1f}; p {}\nr = {:.2f}, {}'.format(self._comparison_table(False), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() else: return line1 class BrunnerMunzelResult: """ Result of a Brunner–Munzel test See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*. """ """Result of a Brunner-Munzel test""" def __init__(self, statistic, pvalue): #: *W* statistic (*float*) self.statistic = statistic #: *p* value for the *W* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Brunner–Munzel test :rtype: str """ return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'): """ Perform a Mann-Whitney *U* test By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant. If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised. The Brunner–Munzel test is returned only if non-significant. :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (dichotomous) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant :type brunnermunzel: bool :param use_continuity: See *scipy.stats.mannwhitneyu* :param alternative: See *scipy.stats.mannwhitneyu* :param method: See *scipy.stats.mannwhitneyu* :return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant. :rtype: :class:`yli.sig_tests.MannWhitneyResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Sample': ['Before'] * 12 + ['After'] * 12, 'Oxygen': [11.0, 11.2, 11.2, ...] }) yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less') .. code-block:: text After Before Median (IQR) 10.75 (10.55–10.95) 11.55 (11.20–11.83) Median (range) 10.75 (11.00–12.10) 11.55 (11.00–12.10) U = 6.0; p < 0.001* r = 0.92, Before > After The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001. The rank-biserial correlation is 0.92 in favour of the *Before* group. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Do Mann-Whitney test # Stata does not perform continuity correction result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) u1 = result.statistic u2 = len(data1) * len(data2) - u1 r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial # If significant, perform a Brunner-Munzel test for our interest if result.pvalue < 0.05 and brunnermunzel: result_bm = stats.brunnermunzel(data1, data2) if result_bm.pvalue >= 0.05: warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.') return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2), brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue)) return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, group1=group1, group2=group2, med1=data1.median(), med2=data2.median(), iqr1=ConfidenceInterval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=ConfidenceInterval(data2.quantile(0.25), data2.quantile(0.75)), range1=ConfidenceInterval(data1.min(), data1.max()), range2=ConfidenceInterval(data2.min(), data2.max()), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2)) # ------------------------ # Pearson chi-squared test class PearsonChiSquaredResult: """ Result of a Pearson *χ*:sup:`2` test See :func:`yli.chi2`. """ def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): #: Contingency table for the observations (*DataFrame*) self.ct = ct #: *χ*:sup:`2` statistic (*float*) self.statistic = statistic #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) self.dof = dof #: *p* value for the *χ*:sup:`2` test (*float*) self.pvalue = pvalue #: Odds ratio (*float*; *None* if not a 2×2 table) self.oddsratio = oddsratio #: Risk ratio (*float*; *None* if not a 2×2 table) self.riskratio = riskratio def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): if self.oddsratio is not None: return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}
χ2({}) = {:.2f}; p {}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *χ*:sup:`2` test :rtype: str """ if self.oddsratio is not None: return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}\n\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def chi2(df, dep, ind, *, nan_policy='warn'): """ Perform a Pearson *χ*:sup:`2` test If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated. The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean). The risk ratio is calculated relative to the independent variable (rows of the contingency table). :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (categorical) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]), 'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600]) }) yli.chi2(df, 'Stress', 'Response') .. code-block:: text Stress False True Response False 250 400 True 750 1600 χ²(1) = 9.82; p = 0.002* OR (95% CI) = 1.33 (1.11–1.60) RR (95% CI) = 1.11 (1.03–1.18) The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002. The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60. The risk of *Stress* in the *Response* = *True* group is 1.11 that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute contingency table ct = pd.crosstab(df[ind], df[dep]) # Get expected counts expected = stats.contingency.expected_freq(ct) # Warn on low expected counts if (expected < 5).sum() / expected.size > 0.2: warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100)) if (expected < 1).any(): warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum())) if ct.shape == (2,2): # 2x2 table # Use statsmodels to get OR and RR smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False) result = smct.test_nominal_association() ORci = smct.oddsratio_confint(config.alpha) RRci = smct.riskratio_confint(config.alpha) return PearsonChiSquaredResult( ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue, oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1])) else: # rxc table # Just use SciPy result = stats.chi2_contingency(ct, correction=False) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) # ------------------- # Pearson correlation class PearsonRResult: """ Result of Pearson correlation See :func:`yli.pearsonr`. """ def __init__(self, statistic, pvalue): #: Pearson *r* correlation statistic (*float*) self.statistic = statistic #: *p* value for the *r* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Pearson correlation :rtype: str """ return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION)) def pearsonr(df, dep, ind, *, nan_policy='warn'): """ Compute the Pearson correlation coefficient (Pearson's *r*) :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numerical) :type dep: str :param ind: Column in *df* for the independent variable (numerical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonRResult` **Example:** .. code-block:: df = pd.DataFrame({ 'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52], 'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49] }) yli.pearsonr(df, 'y', 'x') .. code-block:: text r (95% CI) = 0.65 (0.11–0.89); p = 0.02* The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.11–0.89, and the test is significant with *p* value 0.02. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute Pearson's r result = stats.pearsonr(df[ind], df[dep]) ci = result.confidence_interval() return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)