# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import functools
import warnings
from .config import config
from .utils import Estimate, Interval, PValueStyle, as_2groups, as_numeric, check_nan, convert_pandas_nullable, fmt_p
# ----------------
# Student's t test
class TTestResult:
"""
Result of a Student's *t* test
See :func:`yli.ttest_ind`.
"""
def __init__(self, *, statistic, dof, pvalue, dep, ind, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction):
#: *t* statistic (*float*)
self.statistic = statistic
#: Degrees of freedom of the *t* distribution (*int*)
self.dof = dof
#: *p* value for the *t* statistic (*float*)
self.pvalue = pvalue
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the independent variable (*str*)
self.ind = ind
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
#: Mean of the first group (*float*)
self.mu1 = mu1
#: Mean of the second group (*float*)
self.mu2 = mu2
#: Standard deviation of the first group (*float*)
self.sd1 = sd1
#: Standard deviation of the second group (*float*)
self.sd2 = sd2
#: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
self.delta = delta
#: Description of the direction of the effect (*str*)
self.delta_direction = delta_direction
def _comparison_table(self, html):
"""Return a table showing the means/SDs for each group"""
table_data = [[
'{:.2f} ({:.2f})'.format(self.mu1, self.sd1),
'{:.2f} ({:.2f})'.format(self.mu2, self.sd2),
]]
if html:
			table = pd.DataFrame(table_data, index=pd.Index(['\ue000 (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))  # U+E000 (Private Use Area) marks the position of the μ symbol for later replacement
table_str = table._repr_html_()
return table_str.replace('\ue000', '<i>μ</i>')
else:
table = pd.DataFrame(table_data, index=pd.Index(['μ (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '{}<br><i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br>Δ<i>μ</i> ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
def summary(self):
"""
Return a stringified summary of the *t* test
:rtype: str
"""
return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
def summary_short(self, html):
"""
Return a stringified summary of the *t* test (*t* statistic only)
:rtype: str
"""
if html:
return '<i>t</i>({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
else:
return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
"""
Perform an independent 2-sample Student's *t* test
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (dichotomous)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.TTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
'Potency': [10.2, 10.5, 10.3, ...]
})
yli.ttest_ind(df, 'Potency', 'Type')
.. code-block:: text
		Type           Fresh       Stored
		Potency
		μ (SD)  10.37 (0.32)  9.83 (0.24)
t(18) = 4.24; p < 0.001*
Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Do t test
# Use statsmodels rather than SciPy because this provides the mean difference automatically
d1 = sm.stats.DescrStatsW(data1, ddof=1)
d2 = sm.stats.DescrStatsW(data2, ddof=1)
cm = sm.stats.CompareMeans(d1, d2)
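	# Note: statsmodels CompareMeans.ttest_ind defaults to the pooled-variance (Student) t test (usevar='pooled'), not Welch's t test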
statistic, pvalue, dof = cm.ttest_ind()
delta = d1.mean - d2.mean
ci0, ci1 = cm.tconfint_diff(config.alpha)
# t test is symmetric so take absolute value
if d2.mean > d1.mean:
delta, ci0, ci1 = -delta, -ci1, -ci0
d1, d2 = d2, d1
group1, group2 = group2, group1
# Now group1 > group2
return TTestResult(
statistic=abs(statistic), dof=dof, pvalue=pvalue,
dep=dep, ind=ind,
group1=group1, group2=group2, mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std,
delta=Estimate(delta, ci0, ci1),
delta_direction='{} > {}'.format(group1, group2))
class MultipleTTestResult:
"""
Result of multiple Student's *t* tests, adjusted for multiplicity
See :func:`yli.ttest_ind_multiple`.
"""
def __init__(self, *, dep, results):
#: Name of the dependent variable (*str*)
self.dep = dep
#: Results of the *t* tests (*List[*\ :class:`TTestResult`\ *]*)
self.results = results
def _comparison_table(self, html):
"""Return a table showing the means/SDs for each group"""
group1 = self.results[0].group1
group2 = self.results[0].group2
# TODO: Render HTML directly so can have proper HTML p values
table_data = []
for row in self.results:
cell1 = '{:.2f} ({:.2f})'.format(row.mu1, row.sd1)
cell2 = '{:.2f} ({:.2f})'.format(row.mu2, row.sd2)
cell_pvalue = fmt_p(row.pvalue, PValueStyle.TABULAR)
# Display the cells the right way around
if row.group1 == group1 and row.group2 == group2:
table_data.append([cell1, cell2, cell_pvalue])
elif row.group1 == group2 and row.group2 == group1:
table_data.append([cell2, cell1, cell_pvalue])
else:
raise Exception('t tests have different groups')
if html:
			table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='\ue000 (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, '\ue001'], name=self.dep))  # U+E000/U+E001 (Private Use Area) mark the μ and p symbols for later replacement
table_str = table._repr_html_()
return table_str.replace('\ue000', '<i>μ</i>').replace('\ue001', '<i>p</i>')
else:
table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='μ (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, 'p'], name=self.dep))
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return self._comparison_table(True)
def summary(self):
"""
Return a stringified summary of the *t* tests
:rtype: str
"""
return str(self._comparison_table(False))
def ttest_ind_multiple(df, dep, inds, *, nan_policy='warn', method='hs'):
"""
Perform independent 2-sample Student's *t* tests with multiple independent variables, adjusting for multiplicity
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
	:param inds: Columns in *df* for the independent variables (dichotomous)
	:type inds: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:param method: Method to apply for multiplicity adjustment (see `statsmodels multipletests <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_)
:type method: str
:rtype: :class:`yli.sig_tests.MultipleTTestResult`
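	
	**Example** (an illustrative sketch only – the column names and data here are hypothetical, and no output is reproduced):
	
	.. code-block::
	
		df = pd.DataFrame({
			'Score': [...],   # numeric dependent variable
			'Male': [...],    # dichotomous independent variable
			'Smoker': [...],  # dichotomous independent variable
		})
		yli.ttest_ind_multiple(df, 'Score', ['Male', 'Smoker'])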
"""
# TODO: Unit testing
# FIXME: Assert groups of independent variables have same levels
# Perform t tests
results = []
for ind in inds:
results.append(ttest_ind(df, dep, ind, nan_policy=nan_policy))
# Adjust for multiplicity
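	# statsmodels' default method 'hs' is the Holm-Šidák step-down procedure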
_, pvalues_corrected, _, _ = sm.stats.multipletests([result.pvalue for result in results], alpha=config.alpha, method=method)
for result, pvalue_corrected in zip(results, pvalues_corrected):
result.pvalue = pvalue_corrected
return MultipleTTestResult(dep=dep, results=results)
# -------------
# One-way ANOVA
class FTestResult:
"""
Result of an *F* test for ANOVA/regression
See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionModel.ftest`.
"""
def __init__(self, statistic, dof1, dof2, pvalue):
#: *F* statistic (*float*)
self.statistic = statistic
#: Degrees of freedom in the *F* distribution numerator (*int*)
self.dof1 = dof1
#: Degrees of freedom in the *F* distribution denominator (*int*)
self.dof2 = dof2
#: *p* value for the *F* statistic (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the *F* test
:rtype: str
"""
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
def anova_oneway(df, dep, ind, *, nan_policy='warn'):
"""
Perform one-way ANOVA
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (categorical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.FTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, ...]
})
yli.anova_oneway(df, 'Score', 'Method')
.. code-block:: text
F(2, 21) = 29.57; p < 0.001*
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Group by independent variable
groups = df.groupby(ind)[dep]
# Perform one-way ANOVA
result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
# See stats.f_oneway implementation
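	# dfbn = between-groups dof (k - 1); dfwn = within-groups dof (N - k), for k groups and N observations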
dfbn = len(groups.groups) - 1
dfwn = len(df) - len(groups.groups)
return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)
# -----------------
# Mann-Whitney test
class MannWhitneyResult:
"""
Result of a Mann–Whitney *U* test
See :func:`yli.mannwhitney`.
"""
def __init__(self, *, statistic, pvalue, dep, ind, group1, group2, med1, med2, iqr1, iqr2, range1, range2, rank_biserial, direction, brunnermunzel=None):
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
self.statistic = statistic
#: *p* value for the *U* statistic (*float*)
self.pvalue = pvalue
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the independent variable (*str*)
self.ind = ind
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
#: Median of the first group (*float*)
self.med1 = med1
#: Median of the second group (*float*)
self.med2 = med2
#: Interquartile range of the first group (:class:`yli.utils.Interval`)
self.iqr1 = iqr1
#: Interquartile range of the second group (:class:`yli.utils.Interval`)
self.iqr2 = iqr2
#: Range of the first group (:class:`yli.utils.Interval`)
		self.range1 = range1
#: Range of the second group (:class:`yli.utils.Interval`)
self.range2 = range2
#: Absolute value of the rank-biserial correlation (*float*)
self.rank_biserial = rank_biserial
#: Description of the direction of the effect (*str*)
self.direction = direction
#: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A
self.brunnermunzel = brunnermunzel
def _comparison_table(self, html):
"""Return a table showing the medians/IQRs/ranges for each group"""
table_data = [
['{:.2f} ({})'.format(self.med1, self.iqr1.summary()), '{:.2f} ({})'.format(self.med2, self.iqr2.summary())],
['{:.2f} ({})'.format(self.med1, self.range1.summary()), '{:.2f} ({})'.format(self.med2, self.range2.summary())],
]
table = pd.DataFrame(table_data, index=pd.Index(['Median (IQR)', 'Median (range)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
if html:
return table._repr_html_()
else:
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
line1 = '{}<br><i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self._comparison_table(True), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction)
if self.brunnermunzel:
return line1 + '<br>' + self.brunnermunzel._repr_html_()
else:
return line1
def summary(self):
"""
Return a stringified summary of the Mann–Whitney test
:rtype: str
"""
line1 = '{}\n\nU = {:.1f}; p {}\nr = {:.2f}, {}'.format(self._comparison_table(False), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction)
if self.brunnermunzel:
return line1 + '\n' + self.brunnermunzel.summary()
else:
return line1
def summary_short(self, html):
"""
Return a stringified summary of the Mann–Whitney test (*U* statistic only)
:rtype: str
"""
if html:
return '<i>U</i> = {:.1f}'.format(self.statistic)
else:
return 'U = {:.1f}'.format(self.statistic)
class BrunnerMunzelResult:
"""
Result of a Brunner–Munzel test
See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*.
"""
"""Result of a Brunner-Munzel test"""
def __init__(self, statistic, pvalue):
#: *W* statistic (*float*)
self.statistic = statistic
#: *p* value for the *W* statistic (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the Brunner–Munzel test
:rtype: str
"""
return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
"""
Perform a Mann–Whitney *U* test
	By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant.
	If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised, and the Brunner–Munzel result is included in the returned object.
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (dichotomous)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant
:type brunnermunzel: bool
:param use_continuity: See *scipy.stats.mannwhitneyu*
:param alternative: See *scipy.stats.mannwhitneyu*
:param method: See *scipy.stats.mannwhitneyu*
:return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant.
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Sample': ['Before'] * 12 + ['After'] * 12,
'Oxygen': [11.0, 11.2, 11.2, ...]
})
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
.. code-block:: text
		Sample                        After               Before
		Oxygen
		Median (IQR)    10.75 (10.55–10.95)  11.55 (11.20–11.83)
		Median (range)  10.75 (11.00–12.10)  11.55 (11.00–12.10)
U = 6.0; p < 0.001*
r = 0.92, Before > After
The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
The rank-biserial correlation is 0.92 in favour of the *Before* group.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Convert pandas nullable types for independent variables as this breaks mannwhitneyu
df = convert_pandas_nullable(df)
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Ensure numeric, factorising if required
data1, _ = as_numeric(data1)
data2, _ = as_numeric(data2)
# Do Mann-Whitney test
# Stata does not perform continuity correction
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
u1 = result.statistic
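	# The two U statistics satisfy U1 + U2 = n1 * n2, so derive the other from the one returned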
u2 = len(data1) * len(data2) - u1
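	# Rank-biserial correlation: r = |2 * U1 / (n1 * n2) - 1| (equivalently |1 - 2 * U2 / (n1 * n2)|)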
r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial
	# If significant, perform a Brunner-Munzel test for our interest
	if result.pvalue < 0.05 and brunnermunzel:
		result_bm = stats.brunnermunzel(data1, data2)
		if result_bm.pvalue >= 0.05:
			warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
			return MannWhitneyResult(
				statistic=min(u1, u2), pvalue=result.pvalue,
				dep=dep, ind=ind,
				group1=group1, group2=group2,
				med1=data1.median(), med2=data2.median(), iqr1=Interval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=Interval(data2.quantile(0.25), data2.quantile(0.75)), range1=Interval(data1.min(), data1.max()), range2=Interval(data2.min(), data2.max()),
				rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),
				brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))
return MannWhitneyResult(
statistic=min(u1, u2), pvalue=result.pvalue,
dep=dep, ind=ind,
group1=group1, group2=group2,
med1=data1.median(), med2=data2.median(), iqr1=Interval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=Interval(data2.quantile(0.25), data2.quantile(0.75)), range1=Interval(data1.min(), data1.max()), range2=Interval(data2.min(), data2.max()),
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
# ------------------------
# Pearson chi-squared test
class ChiSquaredResult:
"""
Result of a generic test with *χ*:sup:`2`-distributed test statistic
	See :func:`yli.logrank`, :meth:`yli.regress.RegressionModel.deviance_chi2`.
See also :class:`yli.regress.BrantResult`, :class:`yli.regress.LikelihoodRatioTestResult`, :class:`PearsonChiSquaredResult`.
"""
def __init__(self, statistic, dof, pvalue):
#: *χ*:sup:`2` statistic (*float*)
self.statistic = statistic
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
self.dof = dof
#: *p* value for the *χ*:sup:`2` test (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the *χ*:sup:`2` test
:rtype: str
"""
return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
class PearsonChiSquaredResult(ChiSquaredResult):
"""
Result of a Pearson *χ*:sup:`2` test
See :func:`yli.chi2`.
"""
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
super().__init__(statistic, dof, pvalue)
#: Contingency table for the observations (*DataFrame*)
self.ct = ct
#: Odds ratio (*float*; *None* if not a 2×2 table)
self.oddsratio = oddsratio
#: Risk ratio (*float*; *None* if not a 2×2 table)
self.riskratio = riskratio
def _repr_html_(self):
if self.oddsratio is not None:
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
else:
return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the *χ*:sup:`2` test
:rtype: str
"""
if self.oddsratio is not None:
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
else:
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
def summary_short(self, html):
"""
Return a stringified summary of the *χ*:sup:`2` test (*χ*:sup:`2` statistic only)
:rtype: str
"""
if html:
return '<i>χ</i><sup>2</sup>({}) = {:.2f}'.format(self.dof, self.statistic)
else:
return 'χ²({}) = {:.2f}'.format(self.dof, self.statistic)
def chi2(df, dep, ind, *, nan_policy='warn'):
"""
Perform a Pearson *χ*:sup:`2` test
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
	The ratios are calculated for the higher-valued level of each variable (i.e. *True* compared with *False* for a boolean).
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (categorical)
:type dep: str
:param ind: Column in *df* for the independent variable (categorical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
})
yli.chi2(df, 'Stress', 'Response')
.. code-block:: text
		Stress    False  True
		Response
		False       250   400
		True        750  1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.11–1.60)
RR (95% CI) = 1.11 (1.03–1.18)
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Compute contingency table
ct = pd.crosstab(df[ind], df[dep])
# Get expected counts
expected = stats.contingency.expected_freq(ct)
# Warn on low expected counts
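	# (Cochran's rule of thumb: no more than 20% of expected counts below 5, and none below 1)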
if (expected < 5).sum() / expected.size > 0.2:
warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
if (expected < 1).any():
warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
if ct.shape == (2,2):
# 2x2 table
# Use statsmodels to get OR and RR
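		# np.flip reverses both axes of the table, so the OR/RR are computed for the higher-valued level of each variable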
smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
result = smct.test_nominal_association()
ORci = smct.oddsratio_confint(config.alpha)
RRci = smct.riskratio_confint(config.alpha)
return PearsonChiSquaredResult(
ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
else:
# rxc table
# Just use SciPy
result = stats.chi2_contingency(ct, correction=False)
return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
# -------------------
# Pearson correlation
class PearsonRResult:
"""
Result of Pearson product-moment correlation
See :func:`yli.pearsonr`.
"""
def __init__(self, statistic, pvalue):
#: Pearson *r* correlation statistic (:class:`yli.utils.Estimate`)
self.statistic = statistic
#: *p* value for the *r* statistic (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>r</i> ({:g}% CI) = {}; <i>p</i> {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the Pearson correlation
:rtype: str
"""
return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION))
def pearsonr(df, dep, ind, *, nan_policy='warn'):
"""
Compute the Pearson product-moment correlation coefficient (Pearson's *r*)
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numerical)
:type dep: str
:param ind: Column in *df* for the independent variable (numerical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonRResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52],
'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49]
})
yli.pearsonr(df, 'y', 'x')
.. code-block:: text
r (95% CI) = 0.65 (0.11–0.89); p = 0.02*
The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.11–0.89, and the test is significant with *p* value 0.02.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Compute Pearson's r
result = stats.pearsonr(df[ind], df[dep])
ci = result.confidence_interval()
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
# --------------------
# Spearman correlation
class SpearmanResult:
"""
Result of Spearman rank correlation
See :func:`yli.spearman`.
"""
def __init__(self, statistic, pvalue):
#: Spearman *ρ* correlation statistic (:class:`yli.utils.Estimate`)
self.statistic = statistic
#: *p* value for the *ρ* statistic (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>ρ</i> ({:g}% CI) = {}; <i>p</i> {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the Spearman correlation
:rtype: str
"""
return 'ρ ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION))
def spearman(df, dep, ind, *, nan_policy='warn'):
"""
Compute the Spearman rank correlation coefficient (Spearman's *ρ*)
The confidence interval for *ρ* is computed analogously to SciPy's *pearsonr*, using the Fisher transformation and normal approximation, without adjustment to variance.
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numerical)
:type dep: str
:param ind: Column in *df* for the independent variable (numerical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.SpearmanResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Profit': [2.5, 6.2, 3.1, ...],
'Quality': [50, 57, 61, ...]
})
yli.spearman(df, 'Profit', 'Quality')
.. code-block:: text
ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*
The output states that the value of the Spearman correlation coefficient is 0.87, with 95% confidence interval 0.60–0.96, and the test is significant with *p* value < 0.001.
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Ensure numeric, factorising categorical variables as required
ind, _ = as_numeric(df[ind])
dep, _ = as_numeric(df[dep])
# Compute Spearman's rho
result = stats.spearmanr(ind, dep)
# Compute confidence interval
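	# Note: _pearsonr_fisher_ci is a private SciPy API (scipy.stats._stats_py) and may change between SciPy versions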
ci = stats._stats_py._pearsonr_fisher_ci(result.correlation, len(dep), 1 - config.alpha, 'two-sided')
return SpearmanResult(statistic=Estimate(result.correlation, ci.low, ci.high), pvalue=result.pvalue)
# ----------------------------
# Automatic selection of tests
class AutoBinaryResult:
"""
Result of automatically computed univariable tests of association for a dichotomous dependent variable
See :func:`yli.auto_univariable`.
	Result data stored within instances of this class are not intended to be accessed directly.
"""
def __init__(self, *, dep, group1, group2, result_data, result_labels):
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
# List of tuples (first group summary, second group summary, test result)
self._result_data = result_data
# List of tuples (plaintext label, HTML label)
self._result_labels = result_labels
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
result = '<table><thead><tr><th>{}</th><th>{}</th><th>{}</th><th></th><th style="text-align:center"><i>p</i></th></tr></thead><tbody>'.format(self.dep, self.group1, self.group2)
for data, label in zip(self._result_data, self._result_labels):
result += '<tr><th>{}</th><td>{}</td><td>{}</td><td style="text-align:left">{}</td><td style="text-align:left">{}</td></tr>'.format(label[1], data[0], data[1], data[2].summary_short(True), fmt_p(data[2].pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
result += '</tbody></table>'
return result
def summary(self):
"""
Return a stringified summary of the tests of association
:rtype: str
"""
# Format data for output
result_data_fmt = [
[r[0], r[1], r[2].summary_short(False), fmt_p(r[2].pvalue, PValueStyle.TABULAR)]
for r in self._result_data
]
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
return str(table)
def auto_univariable(df, dep, inds, *, nan_policy='warn'):
"""
Automatically compute univariable tests of association for a dichotomous dependent variable
The tests performed are:
* For a dichotomous independent variable – :func:`yli.chi2`
* For a continuous independent variable – :func:`yli.ttest_ind`
* For an ordinal independent variable – :func:`yli.mannwhitney`
If *nan_policy* is *warn* or *omit*, rows with *nan* values are omitted only from the individual tests of association for the missing variables.
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (dichotomous)
:type dep: str
:param inds: Columns in *df* for the independent variables
:type inds: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.AutoBinaryResult`
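	
	**Example** (an illustrative sketch only – the column names and data here are hypothetical, and no output is reproduced):
	
	.. code-block::
	
		df = pd.DataFrame({
			'Outcome': [...],  # dichotomous dependent variable
			'Age': [...],      # continuous independent variable (t test)
			'Male': [...],     # dichotomous independent variable (chi-squared test)
		})
		yli.auto_univariable(df, 'Outcome', ['Age', 'Male'])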
"""
# Check for/clean NaNs in dependent variable
df = check_nan(df[inds + [dep]], nan_policy, cols=[dep])
# Ensure 2 groups for dep
# TODO: Work for non-binary dependent variables?
group1, data1, group2, data2 = as_2groups(df, inds, dep)
result_data = []
result_labels = []
for ind in inds:
# Check for/clean NaNs in independent variable
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan(df, nan_policy, cols=[ind])
if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
# Mann-Whitney test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, median (IQR)'.format(ind),
'{}, median (IQR)'.format(ind),
))
result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
result
))
elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# Pearson chi-squared test
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
values = sorted(df_cleaned[ind].unique())
# Value counts
result_labels.append((
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
))
result_data.append((
':'.join(str((data1[ind] == v).sum()) for v in values),
':'.join(str((data2[ind] == v).sum()) for v in values),
result
))
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Continuous data
# t test
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, μ (SD)'.format(ind),
'{}, <i>μ</i> (SD)'.format(ind),
))
result_data.append((
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result
))
else:
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))
return AutoBinaryResult(dep=dep, group1=group1, group2=group2, result_data=result_data, result_labels=result_labels)