# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import functools
import warnings

from .config import config
from .utils import Estimate, Interval, PValueStyle, as_2groups, as_numeric, check_nan, convert_pandas_nullable, fmt_p


# ----------------
# Student's t test

class TTestResult:
    """
    Result of a Student's *t* test
    
    See :func:`yli.ttest_ind`.
    """
    
    def __init__(self, *, statistic, dof, pvalue, dep, ind, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction):
        #: *t* statistic (*float*)
        self.statistic = statistic
        #: Degrees of freedom of the *t* distribution (*int*)
        self.dof = dof
        #: *p* value for the *t* statistic (*float*)
        self.pvalue = pvalue
        #: Name of the dependent variable (*str*)
        self.dep = dep
        #: Name of the independent variable (*str*)
        self.ind = ind
        #: Name of the first group (*str*)
        self.group1 = group1
        #: Name of the second group (*str*)
        self.group2 = group2
        #: Mean of the first group (*float*)
        self.mu1 = mu1
        #: Mean of the second group (*float*)
        self.mu2 = mu2
        #: Standard deviation of the first group (*float*)
        self.sd1 = sd1
        #: Standard deviation of the second group (*float*)
        self.sd2 = sd2
        #: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
        self.delta = delta
        #: Description of the direction of the effect (*str*)
        self.delta_direction = delta_direction
    
    def _comparison_table(self, html):
        """Return a table showing the means/SDs for each group"""
        
        table_data = [[
            '{:.2f} ({:.2f})'.format(self.mu1, self.sd1),
            '{:.2f} ({:.2f})'.format(self.mu2, self.sd2),
        ]]
        
        if html:
            # U+E000 (Private Use Area) marks the μ symbol, so italic markup can be substituted in after pandas escapes the HTML
            table = pd.DataFrame(table_data, index=pd.Index(['\ue000 (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
            table_str = table._repr_html_()
            return table_str.replace('\ue000', '<i>μ</i>')
        else:
            table = pd.DataFrame(table_data, index=pd.Index(['μ (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
            return str(table)
    
    def __repr__(self):
        if config.repr_is_summary:
            return self.summary()
        return super().__repr__()
    
    def _repr_html_(self):
        return '{}<br><i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br>Δ<i>μ</i> ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
    
    def summary(self):
        """
        Return a stringified summary of the *t* test
        
        :rtype: str
        """
        
        return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
    
    def summary_short(self, html):
        """
        Return a stringified summary of the *t* test (*t* statistic only)
        
        :rtype: str
        """
        
        if html:
            return '<i>t</i>({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
        else:
            return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic)


def ttest_ind(df, dep, ind, *, nan_policy='warn'):
    """
    Perform an independent 2-sample Student's *t* test
    
    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (numeric)
    :type dep: str
    :param ind: Column in *df* for the independent variable (dichotomous)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str
    
    :rtype: :class:`yli.sig_tests.TTestResult`
    
    **Example:**
    
    .. code-block::
    
        df = pd.DataFrame({
            'Type': ['Fresh'] * 10 + ['Stored'] * 10,
            'Potency': [10.2, 10.5, 10.3, ...]
        })
        yli.ttest_ind(df, 'Potency', 'Type')
    
    .. code-block:: text
    
        Type           Fresh       Stored
        Potency                          
        μ (SD)  10.37 (0.32)  9.83 (0.24)
        
        t(18) = 4.24; p < 0.001*
        Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
    
    The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
    The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
    """
    
    # Check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)
    
    # Ensure 2 groups for ind
    group1, data1, group2, data2 = as_2groups(df, dep, ind)
    
    # Do t test
    # Use statsmodels rather than SciPy because this provides the mean difference automatically
    d1 = sm.stats.DescrStatsW(data1, ddof=1)
    d2 = sm.stats.DescrStatsW(data2, ddof=1)
    
    cm = sm.stats.CompareMeans(d1, d2)
    statistic, pvalue, dof = cm.ttest_ind()
    
    delta = d1.mean - d2.mean
    ci0, ci1 = cm.tconfint_diff(config.alpha)
    
    # t test is symmetric so take absolute value
    if d2.mean > d1.mean:
        delta, ci0, ci1 = -delta, -ci1, -ci0
        d1, d2 = d2, d1
        group1, group2 = group2, group1
    
    # Now group1 > group2
    return TTestResult(
        statistic=abs(statistic), dof=dof, pvalue=pvalue,
        dep=dep, ind=ind, group1=group1, group2=group2,
        mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std,
        delta=Estimate(delta, ci0, ci1), delta_direction='{} > {}'.format(group1, group2))
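# Illustrative use of the result object, mirroring the docstring example above
# (all attributes are as documented on TTestResult):
#
#     result = yli.ttest_ind(df, 'Potency', 'Type')
#     result.statistic         # 4.24...
#     result.dof               # 18.0
#     result.delta.summary()   # '0.54 (0.27–0.81)'
#     result.delta_direction   # 'Fresh > Stored'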
""" def __init__(self, *, dep, results): #: Name of the dependent variable (*str*) self.dep = dep #: Results of the *t* tests (*List[*\ :class:`TTestResult`\ *]*) self.results = results def _comparison_table(self, html): """Return a table showing the means/SDs for each group""" group1 = self.results[0].group1 group2 = self.results[0].group2 # TODO: Render HTML directly so can have proper HTML p values table_data = [] for row in self.results: cell1 = '{:.2f} ({:.2f})'.format(row.mu1, row.sd1) cell2 = '{:.2f} ({:.2f})'.format(row.mu2, row.sd2) cell_pvalue = fmt_p(row.pvalue, PValueStyle.TABULAR) # Display the cells the right way around if row.group1 == group1 and row.group2 == group2: table_data.append([cell1, cell2, cell_pvalue]) elif row.group1 == group2 and row.group2 == group1: table_data.append([cell2, cell1, cell_pvalue]) else: raise Exception('t tests have different groups') if html: table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='\ue000 (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, '\ue001'], name=self.dep)) # U+E000 is in Private Use Area, mark μ symbol table_str = table._repr_html_() return table_str.replace('\ue000', 'μ').replace('\ue001', 'p') else: table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='μ (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, 'p'], name=self.dep)) return str(table) def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return self._comparison_table(True) def summary(self): """ Return a stringified summary of the *t* tests :rtype: str """ return str(self._comparison_table(False)) def ttest_ind_multiple(df, dep, inds, *, nan_policy='warn', method='hs'): """ Perform independent 2-sample Student's *t* tests with multiple independent variables, adjusting for multiplicity :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Columns in *df* for the independent variables (dichotomous) :type ind: List[str] :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :param method: Method to apply for multiplicity adjustment (see `statsmodels multipletests `_) :type method: str :rtype: :class:`yli.sig_tests.MultipleTTestResult` """ # TODO: Unit testing # FIXME: Assert groups of independent variables have same levels # Perform t tests results = [] for ind in inds: results.append(ttest_ind(df, dep, ind, nan_policy=nan_policy)) # Adjust for multiplicity _, pvalues_corrected, _, _ = sm.stats.multipletests([result.pvalue for result in results], alpha=config.alpha, method=method) for result, pvalue_corrected in zip(results, pvalues_corrected): result.pvalue = pvalue_corrected return MultipleTTestResult(dep=dep, results=results) # ------------- # One-way ANOVA class FTestResult: """ Result of an *F* test for ANOVA/regression See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionModel.ftest`. 
""" def __init__(self, statistic, dof1, dof2, pvalue): #: *F* statistic (*float*) self.statistic = statistic #: Degrees of freedom in the *F* distribution numerator (*int*) self.dof1 = dof1 #: Degrees of freedom in the *F* distribution denominator (*int*) self.dof2 = dof2 #: *p* value for the *F* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *F* test :rtype: str """ return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def anova_oneway(df, dep, ind, *, nan_policy='warn'): """ Perform one-way ANOVA :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.FTestResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Method': [1]*8 + [2]*7 + [3]*9, 'Score': [96, 79, 91, ...] }) yli.anova_oneway(df, 'Score', 'Method') .. code-block:: text F(2, 21) = 29.57; p < 0.001* The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Group by independent variable groups = df.groupby(ind)[dep] # Perform one-way ANOVA result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups]) # See stats.f_oneway implementation dfbn = len(groups.groups) - 1 dfwn = len(df) - len(groups.groups) return FTestResult(result.statistic, dfbn, dfwn, result.pvalue) # ----------------- # Mann-Whitney test class MannWhitneyResult: """ Result of a Mann–Whitney *U* test See :func:`yli.mannwhitney`. 
""" def __init__(self, *, statistic, pvalue, dep, ind, group1, group2, med1, med2, iqr1, iqr2, range1, range2, rank_biserial, direction, brunnermunzel=None): #: Lesser of the two Mann–Whitney *U* statistics (*float*) self.statistic = statistic #: *p* value for the *U* statistic (*float*) self.pvalue = pvalue #: Name of the dependent variable (*str*) self.dep = dep #: Name of the independent variable (*str*) self.ind = ind #: Name of the first group (*str*) self.group1 = group1 #: Name of the second group (*str*) self.group2 = group2 #: Median of the first group (*float*) self.med1 = med1 #: Median of the second group (*float*) self.med2 = med2 #: Interquartile range of the first group (:class:`yli.utils.Interval`) self.iqr1 = iqr1 #: Interquartile range of the second group (:class:`yli.utils.Interval`) self.iqr2 = iqr2 #: Range of the first group (:class:`yli.utils.Interval`) self.range1 = range2 #: Range of the second group (:class:`yli.utils.Interval`) self.range2 = range2 #: Absolute value of the rank-biserial correlation (*float*) self.rank_biserial = rank_biserial #: Description of the direction of the effect (*str*) self.direction = direction #: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A self.brunnermunzel = brunnermunzel def _comparison_table(self, html): """Return a table showing the medians/IQRs/ranges for each group""" table_data = [ ['{:.2f} ({})'.format(self.med1, self.iqr1.summary()), '{:.2f} ({})'.format(self.med2, self.iqr2.summary())], ['{:.2f} ({})'.format(self.med1, self.range1.summary()), '{:.2f} ({})'.format(self.med2, self.range2.summary())], ] table = pd.DataFrame(table_data, index=pd.Index(['Median (IQR)', 'Median (range)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind)) if html: return table._repr_html_() else: return str(table) def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): line1 = '{}
U = {:.1f}; p {}
r = {:.2f}, {}'.format(self._comparison_table(True), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '
' + self.brunnermunzel._repr_html_() else: return line1 def summary(self): """ Return a stringified summary of the Mann–Whitney test :rtype: str """ line1 = '{}\n\nU = {:.1f}; p {}\nr = {:.2f}, {}'.format(self._comparison_table(False), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction) if self.brunnermunzel: return line1 + '\n' + self.brunnermunzel.summary() else: return line1 def summary_short(self, html): """ Return a stringified summary of the Mann–Whitney test (*U* statistic only) :rtype: str """ if html: return 'U = {:.1f}'.format(self.statistic) else: return 'U = {:.1f}'.format(self.statistic) class BrunnerMunzelResult: """ Result of a Brunner–Munzel test See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*. """ """Result of a Brunner-Munzel test""" def __init__(self, statistic, pvalue): #: *W* statistic (*float*) self.statistic = statistic #: *p* value for the *W* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Brunner–Munzel test :rtype: str """ return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'): """ Perform a Mann–Whitney *U* test By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant. If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised. The Brunner–Munzel test is returned only if non-significant. :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param ind: Column in *df* for the independent variable (dichotomous) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant :type brunnermunzel: bool :param use_continuity: See *scipy.stats.mannwhitneyu* :param alternative: See *scipy.stats.mannwhitneyu* :param method: See *scipy.stats.mannwhitneyu* :return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant. :rtype: :class:`yli.sig_tests.MannWhitneyResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Sample': ['Before'] * 12 + ['After'] * 12, 'Oxygen': [11.0, 11.2, 11.2, ...] }) yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less') .. code-block:: text Sample After Before Oxygen Median (IQR) 10.75 (10.55–10.95) 11.55 (11.20–11.83) Median (range) 10.75 (11.00–12.10) 11.55 (11.00–12.10) U = 6.0; p < 0.001* r = 0.92, Before > After The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001. The rank-biserial correlation is 0.92 in favour of the *Before* group. 
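    
    The rank-biserial correlation is computed from the *U* statistics as ``abs(2*U1/(n1*n2) - 1)``, where ``U1`` is the *U* statistic for the first group and ``n1``, ``n2`` are the group sizes.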
""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Convert pandas nullable types for independent variables as this breaks mannwhitneyu df = convert_pandas_nullable(df) # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) # Ensure numeric, factorising if required data1, _ = as_numeric(data1) data2, _ = as_numeric(data2) # Do Mann-Whitney test # Stata does not perform continuity correction result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) u1 = result.statistic u2 = len(data1) * len(data2) - u1 r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial # If significant, perform a Brunner-Munzel test for our interest if result.pvalue < 0.05 and brunnermunzel: result_bm = stats.brunnermunzel(data1, data2) if result_bm.pvalue >= 0.05: warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.') return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, #med1=data1.median(), med2=data2.median(), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2), brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue)) return MannWhitneyResult( statistic=min(u1, u2), pvalue=result.pvalue, dep=dep, ind=ind, group1=group1, group2=group2, med1=data1.median(), med2=data2.median(), iqr1=Interval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=Interval(data2.quantile(0.25), data2.quantile(0.75)), range1=Interval(data1.min(), data1.max()), range2=Interval(data2.min(), data2.max()), rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2)) # ------------------------ # Pearson chi-squared test class ChiSquaredResult: """ Result of a generic test with *χ*:sup:`2`-distributed test statistic See :meth:`yli.logrank`, :meth:`yli.regress.RegressionModel.deviance_chi2`. See also :class:`yli.regress.BrantResult`, :class:`yli.regress.LikelihoodRatioTestResult`, :class:`PearsonChiSquaredResult`. """ def __init__(self, statistic, dof, pvalue): #: *χ*:sup:`2` statistic (*float*) self.statistic = statistic #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) self.dof = dof #: *p* value for the *χ*:sup:`2` test (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'χ2({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *χ*:sup:`2` test :rtype: str """ return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) class PearsonChiSquaredResult(ChiSquaredResult): """ Result of a Pearson *χ*:sup:`2` test See :func:`yli.chi2`. """ def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): super().__init__(statistic, dof, pvalue) #: Contingency table for the observations (*DataFrame*) self.ct = ct #: Odds ratio (*float*; *None* if not a 2×2 table) self.oddsratio = oddsratio #: Risk ratio (*float*; *None* if not a 2×2 table) self.riskratio = riskratio def _repr_html_(self): if self.oddsratio is not None: return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}
χ2({}) = {:.2f}; p {}'.format( self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the *χ*:sup:`2` test :rtype: str """ if self.oddsratio is not None: return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary()) else: return '{}\n\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) def summary_short(self, html): """ Return a stringified summary of the *χ*:sup:`2` test (*χ*:sup:`2` statistic only) :rtype: str """ if html: return 'χ2({}) = {:.2f}'.format(self.dof, self.statistic) else: return 'χ²({}) = {:.2f}'.format(self.dof, self.statistic) def chi2(df, dep, ind, *, nan_policy='warn'): """ Perform a Pearson *χ*:sup:`2` test If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated. The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean). The risk ratio is calculated relative to the independent variable (rows of the contingency table). :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (categorical) :type dep: str :param ind: Column in *df* for the independent variable (categorical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]), 'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600]) }) yli.chi2(df, 'Stress', 'Response') .. code-block:: text Stress False True Response False 250 400 True 750 1600 χ²(1) = 9.82; p = 0.002* OR (95% CI) = 1.33 (1.11–1.60) RR (95% CI) = 1.11 (1.03–1.18) The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002. The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60. The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18. 
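    
    As a check, the ratios can be recovered from the contingency table by hand: OR = (1600/750)/(400/250) ≈ 1.33, and RR = (1600/2350)/(400/650) ≈ 1.11.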
""" # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute contingency table ct = pd.crosstab(df[ind], df[dep]) # Get expected counts expected = stats.contingency.expected_freq(ct) # Warn on low expected counts if (expected < 5).sum() / expected.size > 0.2: warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100)) if (expected < 1).any(): warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum())) if ct.shape == (2,2): # 2x2 table # Use statsmodels to get OR and RR smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False) result = smct.test_nominal_association() ORci = smct.oddsratio_confint(config.alpha) RRci = smct.riskratio_confint(config.alpha) return PearsonChiSquaredResult( ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue, oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1])) else: # rxc table # Just use SciPy result = stats.chi2_contingency(ct, correction=False) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) # ------------------- # Pearson correlation class PearsonRResult: """ Result of Pearson product-moment correlation See :func:`yli.pearsonr`. """ def __init__(self, statistic, pvalue): #: Pearson *r* correlation statistic (:class:`yli.utils.Estimate`) self.statistic = statistic #: *p* value for the *r* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Pearson correlation :rtype: str """ return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION)) def pearsonr(df, dep, ind, *, nan_policy='warn'): """ Compute the Pearson product-moment correlation coefficient (Pearson's *r*) :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numerical) :type dep: str :param ind: Column in *df* for the independent variable (numerical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.PearsonRResult` **Example:** .. code-block:: df = pd.DataFrame({ 'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52], 'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49] }) yli.pearsonr(df, 'y', 'x') .. code-block:: text r (95% CI) = 0.65 (0.11–0.89); p = 0.02* The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.11–0.89, and the test is significant with *p* value 0.02. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Compute Pearson's r result = stats.pearsonr(df[ind], df[dep]) ci = result.confidence_interval() return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue) # -------------------- # Spearman correlation class SpearmanResult: """ Result of Spearman rank correlation See :func:`yli.spearman`. 
""" def __init__(self, statistic, pvalue): #: Spearman *ρ* correlation statistic (:class:`yli.utils.Estimate`) self.statistic = statistic #: *p* value for the *ρ* statistic (*float*) self.pvalue = pvalue def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): return 'ρ ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the Spearman correlation :rtype: str """ return 'ρ ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION)) def spearman(df, dep, ind, *, nan_policy='warn'): """ Compute the Spearman rank correlation coefficient (Spearman's *ρ*) The confidence interval for *ρ* is computed analogously to SciPy's *pearsonr*, using the Fisher transformation and normal approximation, without adjustment to variance. :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numerical) :type dep: str :param ind: Column in *df* for the independent variable (numerical) :type ind: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.SpearmanResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Profit': [2.5, 6.2, 3.1, ...], 'Quality': [50, 57, 61, ...] }) yli.spearman(df, 'Profit', 'Quality') .. code-block:: text ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001* The output states that the value of the Spearman correlation coefficient is 0.87, with 95% confidence interval 0.60–0.96, and the test is significant with *p* value < 0.001. """ # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) # Ensure numeric, factorising categorical variables as required ind, _ = as_numeric(df[ind]) dep, _ = as_numeric(df[dep]) # Compute Spearman's rho result = stats.spearmanr(ind, dep) # Compute confidence interval ci = stats._stats_py._pearsonr_fisher_ci(result.correlation, len(dep), 1 - config.alpha, 'two-sided') return SpearmanResult(statistic=Estimate(result.correlation, ci.low, ci.high), pvalue=result.pvalue) # ---------------------------- # Automatic selection of tests class AutoBinaryResult: """ Result of automatically computed univariable tests of association for a dichotomous dependent variable See :func:`yli.auto_univariable`. Results data stored within instances of this class is not intended to be directly accessed. """ def __init__(self, *, dep, group1, group2, result_data, result_labels): #: Name of the dependent variable (*str*) self.dep = dep #: Name of the first group (*str*) self.group1 = group1 #: Name of the second group (*str*) self.group2 = group2 # List of tuples (first group summary, second group summary, test result) self._result_data = result_data # List of tuples (plaintext label, HTML label) self._result_labels = result_labels def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): result = ''.format(self.dep, self.group1, self.group2) for data, label in zip(self._result_data, self._result_labels): result += ''.format(label[1], data[0], data[1], data[2].summary_short(True), fmt_p(data[2].pvalue, PValueStyle.TABULAR | PValueStyle.HTML)) result += '
{}{}{}p
{}{}{}{}{}
' return result def summary(self): """ Return a stringified summary of the tests of association :rtype: str """ # Format data for output result_data_fmt = [ [r[0], r[1], r[2].summary_short(False), fmt_p(r[2].pvalue, PValueStyle.TABULAR)] for r in self._result_data ] result_labels_fmt = [r[0] for r in self._result_labels] table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep)) return str(table) def auto_univariable(df, dep, inds, *, nan_policy='warn'): """ Automatically compute univariable tests of association for a dichotomous dependent variable The tests performed are: * For a dichotomous independent variable – :func:`yli.chi2` * For a continuous independent variable – :func:`yli.ttest_ind` * For an ordinal independent variable – :func:`yli.mannwhitney` If *nan_policy* is *warn* or *omit*, rows with *nan* values are omitted only from the individual tests of association for the missing variables. :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (dichotomous) :type dep: str :param inds: Columns in *df* for the independent variables :type inds: List[str] :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :rtype: :class:`yli.sig_tests.AutoBinaryResult` """ # Check for/clean NaNs in dependent variable df = check_nan(df[inds + [dep]], nan_policy, cols=[dep]) # Ensure 2 groups for dep # TODO: Work for non-binary dependent variables? group1, data1, group2, data2 = as_2groups(df, inds, dep) result_data = [] result_labels = [] for ind in inds: # Check for/clean NaNs in independent variable # Following this, we pass nan_policy='raise' to assert no NaNs remaining df_cleaned = check_nan(df, nan_policy, cols=[ind]) if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'): # Ordinal numeric data # Mann-Whitney test result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise') result_labels.append(( '{}, median (IQR)'.format(ind), '{}, median (IQR)'.format(ind), )) result_data.append(( '{:.2f} ({})'.format(result.med1, result.iqr1.summary()), '{:.2f} ({})'.format(result.med2, result.iqr2.summary()), result )) elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'): # Categorical data # Pearson chi-squared test result = chi2(df_cleaned, dep, ind, nan_policy='raise') values = sorted(df_cleaned[ind].unique()) # Value counts result_labels.append(( '{}, {}'.format(ind, ':'.join(str(v) for v in values)), '{}, {}'.format(ind, ':'.join(str(v) for v in values)), )) result_data.append(( ':'.join(str((data1[ind] == v).sum()) for v in values), ':'.join(str((data2[ind] == v).sum()) for v in values), result )) elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): # Continuous data # t test result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise') result_labels.append(( '{}, μ (SD)'.format(ind), '{}, μ (SD)'.format(ind), )) result_data.append(( '{:.2f} ({:.2f})'.format(result.mu1, result.sd1), '{:.2f} ({:.2f})'.format(result.mu2, result.sd2), result )) else: raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype)) return AutoBinaryResult(dep=dep, group1=group1, group2=group2, result_data=result_data, result_labels=result_labels)
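# Illustrative use of auto_univariable (column names are hypothetical):
#
#     yli.auto_univariable(df, 'Survived', ['Age', 'Sex', 'SeverityScore'])
#
# Each independent variable is dispatched on its dtype, as implemented above:
# ordered numeric categoricals go to mannwhitney, other categorical/boolean/object
# columns go to chi2, and float/int columns go to ttest_ind.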