scipy-yli/yli/sig_tests.py

1049 lines
35 KiB
Python
Raw Normal View History

2022-10-11 22:52:42 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
2022-10-11 22:52:42 +11:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
2022-10-13 13:25:24 +11:00
import numpy as np
2022-10-11 22:52:42 +11:00
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import functools
import warnings
from .config import config
2022-12-03 20:00:29 +11:00
from .utils import Estimate, Interval, PValueStyle, as_2groups, as_numeric, check_nan, convert_pandas_nullable, fmt_p
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
# ----------------
# Student's t test
2022-10-11 22:52:42 +11:00
class TTestResult:
"""
2022-10-17 21:41:19 +11:00
Result of a Student's *t* test
2022-10-11 22:52:42 +11:00
2022-10-17 21:41:19 +11:00
See :func:`yli.ttest_ind`.
2022-10-11 22:52:42 +11:00
"""
def __init__(self, *, statistic, dof, pvalue, dep, ind, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction):
2022-10-17 21:41:19 +11:00
#: *t* statistic (*float*)
2022-10-11 22:52:42 +11:00
self.statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom of the *t* distribution (*int*)
2022-10-11 22:52:42 +11:00
self.dof = dof
2022-10-17 21:41:19 +11:00
#: *p* value for the *t* statistic (*float*)
2022-10-11 22:52:42 +11:00
self.pvalue = pvalue
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the independent variable (*str*)
self.ind = ind
2022-11-09 17:01:45 +11:00
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
#: Mean of the first group (*float*)
self.mu1 = mu1
#: Mean of the second group (*float*)
self.mu2 = mu2
#: Standard deviation of the first group (*float*)
self.sd1 = sd1
#: Standard deviation of the second group (*float*)
self.sd2 = sd2
2022-10-17 21:41:19 +11:00
#: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
2022-10-11 22:52:42 +11:00
self.delta = delta
2022-10-17 21:41:19 +11:00
#: Description of the direction of the effect (*str*)
2022-10-13 12:53:18 +11:00
self.delta_direction = delta_direction
2022-10-11 22:52:42 +11:00
def _comparison_table(self, html):
"""Return a table showing the means/SDs for each group"""
table_data = [[
'{:.2f} ({:.2f})'.format(self.mu1, self.sd1),
'{:.2f} ({:.2f})'.format(self.mu2, self.sd2),
]]
if html:
table = pd.DataFrame(table_data, index=pd.Index(['\ue000 (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind)) # U+E000 is in Private Use Area, mark μ symbol
table_str = table._repr_html_()
return table_str.replace('\ue000', '<i>μ</i>')
else:
table = pd.DataFrame(table_data, index=pd.Index(['μ (SD)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
2022-10-11 22:52:42 +11:00
def _repr_html_(self):
return '{}<br><i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br>Δ<i>μ</i> ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
2022-10-11 22:52:42 +11:00
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the *t* test
:rtype: str
"""
return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
2022-11-09 23:31:27 +11:00
def summary_short(self, html):
"""
Return a stringified summary of the *t* test (*t* statistic only)
:rtype: str
"""
if html:
return '<i>t</i>({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
else:
return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
2022-10-11 22:52:42 +11:00
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
2022-10-17 21:41:19 +11:00
"""
Perform an independent 2-sample Student's *t* test
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (dichotomous)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.TTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
'Potency': [10.2, 10.5, 10.3, ...]
})
yli.ttest_ind(df, 'Potency', 'Type')
.. code-block:: text
Type Fresh Stored
Potency
μ (SD) 10.37 (0.32) 9.83 (0.24)
t(18) = 4.24; p < 0.001*
Δμ (95% CI) = 0.54 (0.270.81), Fresh > Stored
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.270.81.
2022-10-17 21:41:19 +11:00
"""
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
# Check for/clean NaNs
2022-10-11 22:52:42 +11:00
df = check_nan(df[[ind, dep]], nan_policy)
2022-10-13 12:53:18 +11:00
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
2022-10-11 22:52:42 +11:00
# Do t test
# Use statsmodels rather than SciPy because this provides the mean difference automatically
2022-11-09 17:01:45 +11:00
d1 = sm.stats.DescrStatsW(data1, ddof=1)
d2 = sm.stats.DescrStatsW(data2, ddof=1)
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
cm = sm.stats.CompareMeans(d1, d2)
2022-10-11 22:52:42 +11:00
statistic, pvalue, dof = cm.ttest_ind()
2022-10-13 12:53:18 +11:00
delta = d1.mean - d2.mean
ci0, ci1 = cm.tconfint_diff(config.alpha)
2022-10-11 22:52:42 +11:00
2022-11-09 17:01:45 +11:00
# t test is symmetric so take absolute value
if d2.mean > d1.mean:
delta, ci0, ci1 = -delta, -ci1, -ci0
d1, d2 = d2, d1
group1, group2 = group2, group1
# Now group1 > group2
2022-10-13 12:53:18 +11:00
return TTestResult(
statistic=abs(statistic), dof=dof, pvalue=pvalue,
dep=dep, ind=ind,
2022-11-09 17:01:45 +11:00
group1=group1, group2=group2, mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std,
delta=Estimate(delta, ci0, ci1),
delta_direction='{} > {}'.format(group1, group2))
2022-10-13 13:14:51 +11:00
2023-04-20 15:41:03 +10:00
class MultipleTTestResult:
"""
Result of multiple Student's *t* tests, adjusted for multiplicity
See :func:`yli.ttest_ind_multiple`.
"""
def __init__(self, *, dep, results):
#: Name of the dependent variable (*str*)
self.dep = dep
#: Results of the *t* tests (*List[*\ :class:`TTestResult`\ *]*)
self.results = results
def _comparison_table(self, html):
"""Return a table showing the means/SDs for each group"""
group1 = self.results[0].group1
group2 = self.results[0].group2
# TODO: Render HTML directly so can have proper HTML p values
table_data = []
for row in self.results:
cell1 = '{:.2f} ({:.2f})'.format(row.mu1, row.sd1)
cell2 = '{:.2f} ({:.2f})'.format(row.mu2, row.sd2)
cell_pvalue = fmt_p(row.pvalue, PValueStyle.TABULAR)
# Display the cells the right way around
if row.group1 == group1 and row.group2 == group2:
table_data.append([cell1, cell2, cell_pvalue])
elif row.group1 == group2 and row.group2 == group1:
table_data.append([cell2, cell1, cell_pvalue])
else:
raise Exception('t tests have different groups')
if html:
table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='\ue000 (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, '\ue001'], name=self.dep)) # U+E000 is in Private Use Area, mark μ symbol
table_str = table._repr_html_()
return table_str.replace('\ue000', '<i>μ</i>').replace('\ue001', '<i>p</i>')
else:
table = pd.DataFrame(table_data, index=pd.Index([row.ind for row in self.results], name='μ (SD)'), columns=pd.Index([self.results[0].group1, self.results[0].group2, 'p'], name=self.dep))
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return self._comparison_table(True)
def summary(self):
"""
Return a stringified summary of the *t* tests
:rtype: str
"""
return str(self._comparison_table(False))
def ttest_ind_multiple(df, dep, inds, *, nan_policy='warn', method='hs'):
"""
Perform independent 2-sample Student's *t* tests with multiple independent variables, adjusting for multiplicity
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Columns in *df* for the independent variables (dichotomous)
:type ind: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:param method: Method to apply for multiplicity adjustment (see `statsmodels multipletests <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_)
:type method: str
:rtype: :class:`yli.sig_tests.MultipleTTestResult`
"""
# TODO: Unit testing
# FIXME: Assert groups of independent variables have same levels
# Perform t tests
results = []
for ind in inds:
results.append(ttest_ind(df, dep, ind, nan_policy=nan_policy))
# Adjust for multiplicity
_, pvalues_corrected, _, _ = sm.stats.multipletests([result.pvalue for result in results], alpha=config.alpha, method=method)
for result, pvalue_corrected in zip(results, pvalues_corrected):
result.pvalue = pvalue_corrected
return MultipleTTestResult(dep=dep, results=results)
2022-10-14 20:18:25 +11:00
# -------------
# One-way ANOVA
class FTestResult:
2022-10-17 21:41:19 +11:00
"""
Result of an *F* test for ANOVA/regression
2023-04-16 23:52:12 +10:00
See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionModel.ftest`.
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
def __init__(self, statistic, dof1, dof2, pvalue):
2022-10-17 21:41:19 +11:00
#: *F* statistic (*float*)
2022-10-14 20:18:25 +11:00
self.statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom in the *F* distribution numerator (*int*)
2022-10-14 20:18:25 +11:00
self.dof1 = dof1
2022-10-17 21:41:19 +11:00
#: Degrees of freedom in the *F* distribution denominator (*int*)
2022-10-14 20:18:25 +11:00
self.dof2 = dof2
2022-10-17 21:41:19 +11:00
#: *p* value for the *F* statistic (*float*)
2022-10-14 20:18:25 +11:00
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
2022-10-14 20:18:25 +11:00
def _repr_html_(self):
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
2022-10-14 20:18:25 +11:00
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the *F* test
:rtype: str
"""
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
2022-10-14 20:18:25 +11:00
def anova_oneway(df, dep, ind, *, nan_policy='warn'):
2022-10-17 21:41:19 +11:00
"""
Perform one-way ANOVA
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (categorical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.FTestResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, ...]
})
yli.anova_oneway(df, 'Score', 'Method')
.. code-block:: text
F(2, 21) = 29.57; p < 0.001*
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Group by independent variable
groups = df.groupby(ind)[dep]
# Perform one-way ANOVA
result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
# See stats.f_oneway implementation
dfbn = len(groups.groups) - 1
dfwn = len(df) - len(groups.groups)
return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)
2022-10-13 13:14:51 +11:00
# -----------------
# Mann-Whitney test
class MannWhitneyResult:
"""
Result of a MannWhitney *U* test
2022-10-13 13:14:51 +11:00
2022-10-17 21:41:19 +11:00
See :func:`yli.mannwhitney`.
2022-10-13 13:14:51 +11:00
"""
def __init__(self, *, statistic, pvalue, dep, ind, group1, group2, med1, med2, iqr1, iqr2, range1, range2, rank_biserial, direction, brunnermunzel=None):
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
2022-10-13 13:14:51 +11:00
self.statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *U* statistic (*float*)
2022-10-13 13:14:51 +11:00
self.pvalue = pvalue
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the independent variable (*str*)
self.ind = ind
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
#: Median of the first group (*float*)
self.med1 = med1
#: Median of the second group (*float*)
self.med2 = med2
2022-11-09 22:10:15 +11:00
#: Interquartile range of the first group (:class:`yli.utils.Interval`)
self.iqr1 = iqr1
2022-11-09 22:10:15 +11:00
#: Interquartile range of the second group (:class:`yli.utils.Interval`)
self.iqr2 = iqr2
2022-11-09 22:10:15 +11:00
#: Range of the first group (:class:`yli.utils.Interval`)
self.range1 = range2
2022-11-09 22:10:15 +11:00
#: Range of the second group (:class:`yli.utils.Interval`)
self.range2 = range2
2022-10-17 21:41:19 +11:00
#: Absolute value of the rank-biserial correlation (*float*)
2022-10-13 13:14:51 +11:00
self.rank_biserial = rank_biserial
2022-10-17 21:41:19 +11:00
#: Description of the direction of the effect (*str*)
2022-10-13 13:14:51 +11:00
self.direction = direction
2022-10-17 21:41:19 +11:00
#: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A
2022-10-13 13:14:51 +11:00
self.brunnermunzel = brunnermunzel
def _comparison_table(self, html):
"""Return a table showing the medians/IQRs/ranges for each group"""
table_data = [
['{:.2f} ({})'.format(self.med1, self.iqr1.summary()), '{:.2f} ({})'.format(self.med2, self.iqr2.summary())],
['{:.2f} ({})'.format(self.med1, self.range1.summary()), '{:.2f} ({})'.format(self.med2, self.range2.summary())],
]
table = pd.DataFrame(table_data, index=pd.Index(['Median (IQR)', 'Median (range)'], name=self.dep), columns=pd.Index([self.group1, self.group2], name=self.ind))
if html:
return table._repr_html_()
else:
return str(table)
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
2022-10-13 13:14:51 +11:00
def _repr_html_(self):
line1 = '{}<br><i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self._comparison_table(True), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction)
2022-10-13 13:14:51 +11:00
if self.brunnermunzel:
return line1 + '<br>' + self.brunnermunzel._repr_html_()
else:
return line1
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the MannWhitney test
:rtype: str
"""
line1 = '{}\n\nU = {:.1f}; p {}\nr = {:.2f}, {}'.format(self._comparison_table(False), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction)
2022-10-13 13:14:51 +11:00
if self.brunnermunzel:
return line1 + '\n' + self.brunnermunzel.summary()
else:
return line1
2022-11-09 23:31:27 +11:00
def summary_short(self, html):
"""
Return a stringified summary of the MannWhitney test (*U* statistic only)
:rtype: str
"""
if html:
return '<i>U</i> = {:.1f}'.format(self.statistic)
else:
return 'U = {:.1f}'.format(self.statistic)
2022-10-13 13:14:51 +11:00
class BrunnerMunzelResult:
2022-10-17 21:41:19 +11:00
"""
Result of a BrunnerMunzel test
See :func:`yli.mannwhitney`. This library calls the BrunnerMunzel test statistic *W*.
"""
2022-10-13 13:14:51 +11:00
"""Result of a Brunner-Munzel test"""
def __init__(self, statistic, pvalue):
2022-10-17 21:41:19 +11:00
#: *W* statistic (*float*)
2022-10-13 13:14:51 +11:00
self.statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *W* statistic (*float*)
2022-10-13 13:14:51 +11:00
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
2022-10-13 13:14:51 +11:00
def _repr_html_(self):
return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
2022-10-13 13:14:51 +11:00
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the BrunnerMunzel test
:rtype: str
"""
return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
2022-10-13 13:14:51 +11:00
def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
"""
Perform a MannWhitney *U* test
2022-10-17 21:41:19 +11:00
By default, this function performs a BrunnerMunzel test if the MannWhitney test is significant.
If the MannWhitney test is significant but the BrunnerMunzel test is not, a warning is raised.
The BrunnerMunzel test is returned only if non-significant.
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numeric)
:type dep: str
:param ind: Column in *df* for the independent variable (dichotomous)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:param brunnermunzel: Whether to compute the BrunnerMunzel test if the MannWhitney test is significant
:type brunnermunzel: bool
:param use_continuity: See *scipy.stats.mannwhitneyu*
:param alternative: See *scipy.stats.mannwhitneyu*
:param method: See *scipy.stats.mannwhitneyu*
2022-11-09 17:05:04 +11:00
:return: The result of the MannWhitney test. The result of a BrunnerMunzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the MannWhitney test is significant, *and* the BrunnerMunzel test is non-significant.
2022-10-17 21:41:19 +11:00
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Sample': ['Before'] * 12 + ['After'] * 12,
'Oxygen': [11.0, 11.2, 11.2, ...]
})
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
.. code-block:: text
Sample After Before
Oxygen
Median (IQR) 10.75 (10.5510.95) 11.55 (11.2011.83)
Median (range) 10.75 (11.0012.10) 11.55 (11.0012.10)
U = 6.0; p < 0.001*
r = 0.92, Before > After
The output states that the value of the MannWhitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
The rank-biserial correlation is 0.92 in favour of the *Before* group.
2022-10-13 13:14:51 +11:00
"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
2022-11-09 23:31:27 +11:00
# Convert pandas nullable types for independent variables as this breaks mannwhitneyu
df = convert_pandas_nullable(df)
2022-10-13 13:14:51 +11:00
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Ensure numeric, factorising if required
data1, _ = as_numeric(data1)
data2, _ = as_numeric(data2)
2022-10-13 13:14:51 +11:00
# Do Mann-Whitney test
# Stata does not perform continuity correction
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
u1 = result.statistic
u2 = len(data1) * len(data2) - u1
r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial
# If significant, perform a Brunner-Munzel test for our interest
if result.pvalue < 0.05 and brunnermunzel:
result_bm = stats.brunnermunzel(data1, data2)
if result_bm.pvalue >= 0.05:
warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
return MannWhitneyResult(
statistic=min(u1, u2), pvalue=result.pvalue,
#med1=data1.median(), med2=data2.median(),
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),
brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))
return MannWhitneyResult(
statistic=min(u1, u2), pvalue=result.pvalue,
dep=dep, ind=ind,
group1=group1, group2=group2,
2022-11-09 22:10:15 +11:00
med1=data1.median(), med2=data2.median(), iqr1=Interval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=Interval(data2.quantile(0.25), data2.quantile(0.75)), range1=Interval(data1.min(), data1.max()), range2=Interval(data2.min(), data2.max()),
2022-10-13 13:14:51 +11:00
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
2022-10-13 13:25:24 +11:00
# ------------------------
# Pearson chi-squared test
2022-12-02 20:20:25 +11:00
class ChiSquaredResult:
"""
Result of a generic test with *χ*:sup:`2`-distributed test statistic
See :meth:`yli.logrank`.
See also :class:`yli.regress.BrantResult`, :class:`yli.regress.LikelihoodRatioTestResult`, :class:`PearsonChiSquaredResult`.
"""
2022-12-02 20:20:25 +11:00
def __init__(self, statistic, dof, pvalue):
#: *χ*:sup:`2` statistic (*float*)
self.statistic = statistic
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
self.dof = dof
#: *p* value for the *χ*:sup:`2` test (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the *χ*:sup:`2` test
:rtype: str
"""
2023-02-25 17:23:20 +11:00
return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
2022-12-02 20:20:25 +11:00
class PearsonChiSquaredResult(ChiSquaredResult):
2022-10-17 21:41:19 +11:00
"""
Result of a Pearson *χ*:sup:`2` test
See :func:`yli.chi2`.
"""
2022-10-13 13:25:24 +11:00
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
2022-12-02 20:20:25 +11:00
super().__init__(statistic, dof, pvalue)
2022-10-17 21:41:19 +11:00
#: Contingency table for the observations (*DataFrame*)
2022-10-13 13:25:24 +11:00
self.ct = ct
2022-10-17 21:41:19 +11:00
#: Odds ratio (*float*; *None* if not a 2×2 table)
2022-10-13 13:25:24 +11:00
self.oddsratio = oddsratio
2022-10-17 21:41:19 +11:00
#: Risk ratio (*float*; *None* if not a 2×2 table)
2022-10-13 13:25:24 +11:00
self.riskratio = riskratio
def _repr_html_(self):
if self.oddsratio is not None:
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
2022-10-13 13:25:24 +11:00
else:
return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
2022-10-13 13:25:24 +11:00
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the *χ*:sup:`2` test
:rtype: str
"""
2022-10-13 13:25:24 +11:00
if self.oddsratio is not None:
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
2022-10-13 13:25:24 +11:00
else:
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
2022-11-09 23:31:27 +11:00
def summary_short(self, html):
"""
Return a stringified summary of the *χ*:sup:`2` test (*χ*:sup:`2` statistic only)
:rtype: str
"""
if html:
return '<i>χ</i><sup>2</sup>({}) = {:.2f}'.format(self.dof, self.statistic)
else:
return 'χ²({}) = {:.2f}'.format(self.dof, self.statistic)
2022-10-13 13:25:24 +11:00
def chi2(df, dep, ind, *, nan_policy='warn'):
2022-10-17 21:41:19 +11:00
"""
Perform a Pearson *χ*:sup:`2` test
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
2022-10-17 21:41:19 +11:00
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (categorical)
:type dep: str
:param ind: Column in *df* for the independent variable (categorical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
})
yli.chi2(df, 'Stress', 'Response')
.. code-block:: text
Stress False True
Response
False 250 400
True 750 1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.111.60)
RR (95% CI) = 1.11 (1.031.18)
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.111.60.
2022-12-03 20:01:05 +11:00
The risk of *Stress* in the *Response* = *True* group is 1.11 times that in the *Response* = *False* group, with 95% confidence interval 1.031.18.
2022-10-17 21:41:19 +11:00
"""
2022-10-13 13:25:24 +11:00
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Compute contingency table
ct = pd.crosstab(df[ind], df[dep])
# Get expected counts
expected = stats.contingency.expected_freq(ct)
# Warn on low expected counts
if (expected < 5).sum() / expected.size > 0.2:
warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
if (expected < 1).any():
warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
if ct.shape == (2,2):
# 2x2 table
2022-10-14 20:08:01 +11:00
# Use statsmodels to get OR and RR
2022-10-13 13:25:24 +11:00
smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
result = smct.test_nominal_association()
ORci = smct.oddsratio_confint(config.alpha)
RRci = smct.riskratio_confint(config.alpha)
2022-10-13 13:25:24 +11:00
return PearsonChiSquaredResult(
ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
else:
# rxc table
# Just use SciPy
result = stats.chi2_contingency(ct, correction=False)
return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
2022-10-14 20:18:25 +11:00
# -------------------
# Pearson correlation
class PearsonRResult:
2022-10-17 21:41:19 +11:00
"""
2022-12-03 20:01:05 +11:00
Result of Pearson product-moment correlation
2022-10-17 21:41:19 +11:00
See :func:`yli.pearsonr`.
"""
2022-10-14 20:18:25 +11:00
def __init__(self, statistic, pvalue):
2022-12-03 20:01:05 +11:00
#: Pearson *r* correlation statistic (:class:`yli.utils.Estimate`)
2022-10-14 20:18:25 +11:00
self.statistic = statistic
2022-10-17 21:41:19 +11:00
#: *p* value for the *r* statistic (*float*)
2022-10-14 20:18:25 +11:00
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
2022-10-14 20:18:25 +11:00
def _repr_html_(self):
return '<i>r</i> ({:g}% CI) = {}; <i>p</i> {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
2022-10-14 20:18:25 +11:00
def summary(self):
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the Pearson correlation
:rtype: str
"""
return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION))
2022-10-14 20:18:25 +11:00
def pearsonr(df, dep, ind, *, nan_policy='warn'):
2022-10-17 21:41:19 +11:00
"""
2022-12-03 20:01:05 +11:00
Compute the Pearson product-moment correlation coefficient (Pearson's *r*)
2022-10-17 21:41:19 +11:00
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numerical)
:type dep: str
:param ind: Column in *df* for the independent variable (numerical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.PearsonRResult`
2022-10-20 20:57:57 +11:00
**Example:**
.. code-block::
df = pd.DataFrame({
'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52],
'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49]
})
yli.pearsonr(df, 'y', 'x')
.. code-block:: text
r (95% CI) = 0.65 (0.110.89); p = 0.02*
The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.110.89, and the test is significant with *p* value 0.02.
2022-10-17 21:41:19 +11:00
"""
2022-10-14 20:18:25 +11:00
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Compute Pearson's r
result = stats.pearsonr(df[ind], df[dep])
ci = result.confidence_interval()
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
2022-12-03 20:00:29 +11:00
# --------------------
# Spearman correlation
class SpearmanResult:
2022-12-03 20:01:05 +11:00
"""
Result of Spearman rank correlation
See :func:`yli.spearman`.
"""
2022-12-03 20:00:29 +11:00
def __init__(self, statistic, pvalue):
2022-12-03 20:01:05 +11:00
#: Spearman *ρ* correlation statistic (:class:`yli.utils.Estimate`)
2022-12-03 20:00:29 +11:00
self.statistic = statistic
2022-12-03 20:01:05 +11:00
#: *p* value for the *ρ* statistic (*float*)
2022-12-03 20:00:29 +11:00
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>ρ</i> ({:g}% CI) = {}; <i>p</i> {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the Spearman correlation
:rtype: str
"""
return 'ρ ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION))
def spearman(df, dep, ind, *, nan_policy='warn'):
2022-12-03 20:01:05 +11:00
"""
Compute the Spearman rank correlation coefficient (Spearman's *ρ*)
The confidence interval for *ρ* is computed analogously to SciPy's *pearsonr*, using the Fisher transformation and normal approximation, without adjustment to variance.
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (numerical)
:type dep: str
:param ind: Column in *df* for the independent variable (numerical)
:type ind: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.SpearmanResult`
**Example:**
.. code-block::
df = pd.DataFrame({
'Profit': [2.5, 6.2, 3.1, ...],
'Quality': [50, 57, 61, ...]
})
yli.spearman(df, 'Profit', 'Quality')
.. code-block:: text
ρ (95% CI) = 0.87 (0.600.96); p < 0.001*
The output states that the value of the Spearman correlation coefficient is 0.87, with 95% confidence interval 0.600.96, and the test is significant with *p* value < 0.001.
"""
2022-12-03 20:00:29 +11:00
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Ensure numeric, factorising categorical variables as required
ind, _ = as_numeric(df[ind])
dep, _ = as_numeric(df[dep])
# Compute Spearman's rho
result = stats.spearmanr(ind, dep)
# Compute confidence interval
ci = stats._stats_py._pearsonr_fisher_ci(result.correlation, len(dep), 1 - config.alpha, 'two-sided')
return SpearmanResult(statistic=Estimate(result.correlation, ci.low, ci.high), pvalue=result.pvalue)
2022-11-09 23:31:27 +11:00
# ----------------------------
# Automatic selection of tests
class AutoBinaryResult:
"""
Result of automatically computed univariable tests of association for a dichotomous dependent variable
See :func:`yli.auto_univariable`.
Results data stored within instances of this class is not intended to be directly accessed.
"""
def __init__(self, *, dep, group1, group2, result_data, result_labels):
#: Name of the dependent variable (*str*)
self.dep = dep
#: Name of the first group (*str*)
self.group1 = group1
#: Name of the second group (*str*)
self.group2 = group2
# List of tuples (first group summary, second group summary, test result)
self._result_data = result_data
2022-11-10 21:20:06 +11:00
# List of tuples (plaintext label, HTML label)
2022-11-09 23:31:27 +11:00
self._result_labels = result_labels
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
result = '<table><thead><tr><th>{}</th><th>{}</th><th>{}</th><th></th><th style="text-align:center"><i>p</i></th></tr></thead><tbody>'.format(self.dep, self.group1, self.group2)
for data, label in zip(self._result_data, self._result_labels):
result += '<tr><th>{}</th><td>{}</td><td>{}</td><td style="text-align:left">{}</td><td style="text-align:left">{}</td></tr>'.format(label[1], data[0], data[1], data[2].summary_short(True), fmt_p(data[2].pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
result += '</tbody></table>'
return result
def summary(self):
"""
Return a stringified summary of the tests of association
:rtype: str
"""
# Format data for output
result_data_fmt = [
[r[0], r[1], r[2].summary_short(False), fmt_p(r[2].pvalue, PValueStyle.TABULAR)]
for r in self._result_data
]
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
return str(table)
def auto_univariable(df, dep, inds, *, nan_policy='warn'):
2022-11-09 23:31:27 +11:00
"""
Automatically compute univariable tests of association for a dichotomous dependent variable
The tests performed are:
* For a dichotomous independent variable :func:`yli.chi2`
* For a continuous independent variable :func:`yli.ttest_ind`
* For an ordinal independent variable :func:`yli.mannwhitney`
If *nan_policy* is *warn* or *omit*, rows with *nan* values are omitted only from the individual tests of association for the missing variables.
2022-11-09 23:31:27 +11:00
:param df: Data to perform the test on
:type df: DataFrame
:param dep: Column in *df* for the dependent variable (dichotomous)
:type dep: str
:param inds: Columns in *df* for the independent variables
:type inds: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:rtype: :class:`yli.sig_tests.AutoBinaryResult`
"""
# Check for/clean NaNs in dependent variable
df = check_nan(df[inds + [dep]], nan_policy, cols=[dep])
2022-11-09 23:31:27 +11:00
# Ensure 2 groups for dep
# TODO: Work for non-binary dependent variables?
group1, data1, group2, data2 = as_2groups(df, inds, dep)
result_data = []
result_labels = []
for ind in inds:
# Check for/clean NaNs in independent variable
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan(df, nan_policy, cols=[ind])
if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
# Mann-Whitney test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, median (IQR)'.format(ind),
'{}, median (IQR)'.format(ind),
))
result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
result
))
elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
2022-11-09 23:31:27 +11:00
# Pearson chi-squared test
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
2022-11-09 23:31:27 +11:00
values = sorted(df_cleaned[ind].unique())
2022-11-09 23:31:27 +11:00
# Value counts
result_labels.append((
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
))
result_data.append((
':'.join(str((data1[ind] == v).sum()) for v in values),
':'.join(str((data2[ind] == v).sum()) for v in values),
result
))
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Continuous data
# t test
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, μ (SD)'.format(ind),
'{}, <i>μ</i> (SD)'.format(ind),
))
result_data.append((
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result
))
2022-11-09 23:31:27 +11:00
else:
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))
return AutoBinaryResult(dep=dep, group1=group1, group2=group2, result_data=result_data, result_labels=result_labels)