scipy-yli/yli/sig_tests.py

443 lines
14 KiB
Python
Raw Normal View History

2022-10-11 22:52:42 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
2022-10-13 13:25:24 +11:00
import numpy as np
2022-10-11 22:52:42 +11:00
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import functools
import warnings
from .config import config
2022-10-14 14:48:26 +11:00
from .utils import Estimate, as_2groups, check_nan, fmt_p
2022-10-11 22:52:42 +11:00
2022-10-13 12:53:18 +11:00
# ----------------
# Student's t test
2022-10-11 22:52:42 +11:00
class TTestResult:
    """
    Result of a Student's *t* test

    See :func:`yli.ttest_ind`.
    """

    def __init__(self, statistic, dof, pvalue, delta, delta_direction):
        #: *t* statistic (*float*)
        self.statistic = statistic
        #: Degrees of freedom of the *t* distribution (*int*)
        self.dof = dof
        #: *p* value for the *t* statistic (*float*)
        self.pvalue = pvalue
        #: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
        self.delta = delta
        #: Description of the direction of the effect (*str*)
        self.delta_direction = delta_direction

    def _render(self, template, html):
        # Both renderings take the same six values in the same order;
        # only the template and the p value markup differ
        ci_percent = (1 - config.alpha) * 100
        return template.format(
            self.dof,
            self.statistic,
            fmt_p(self.pvalue, html=html),
            ci_percent,
            self.delta.summary(),
            self.delta_direction,
        )

    def _repr_html_(self):
        """
        Return an HTML summary of the *t* test

        :rtype: str
        """
        return self._render('<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br>Δ<i>μ</i> ({:g}% CI) = {}, {}', True)

    def summary(self):
        """
        Return a stringified summary of the *t* test

        :rtype: str
        """
        return self._render('t({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}', False)
2022-10-11 22:52:42 +11:00
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
    """
    Perform an independent 2-sample Student's *t* test

    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (numeric)
    :type dep: str
    :param ind: Column in *df* for the independent variable (dichotomous)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.TTestResult`
    """
    # Keep only the two columns of interest, then check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)

    # Ensure 2 groups for ind
    group1, data1, group2, data2 = as_2groups(df, dep, ind)

    # Use statsmodels rather than SciPy because this provides the mean difference automatically
    stats1 = sm.stats.DescrStatsW(data1)
    stats2 = sm.stats.DescrStatsW(data2)
    comparison = sm.stats.CompareMeans(stats1, stats2)

    statistic, pvalue, dof = comparison.ttest_ind()
    ci_low, ci_high = comparison.tconfint_diff(config.alpha)
    mean_difference = Estimate(stats1.mean - stats2.mean, ci_low, ci_high)

    # Name the group with the larger mean first
    if stats1.mean > stats2.mean:
        direction = '{0} > {1}'.format(group1, group2)
    else:
        direction = '{1} > {0}'.format(group1, group2)

    # t test is symmetric so take absolute values
    return TTestResult(
        statistic=abs(statistic),
        dof=dof,
        pvalue=pvalue,
        delta=abs(mean_difference),
        delta_direction=direction,
    )
2022-10-13 13:14:51 +11:00
2022-10-14 20:18:25 +11:00
# -------------
# One-way ANOVA
class FTestResult:
    """
    Result of an *F* test for ANOVA/regression

    See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionResult.ftest`.
    """

    def __init__(self, statistic, dof1, dof2, pvalue):
        #: *F* statistic (*float*)
        self.statistic = statistic
        #: Degrees of freedom in the *F* distribution numerator (*int*)
        self.dof1 = dof1
        #: Degrees of freedom in the *F* distribution denominator (*int*)
        self.dof2 = dof2
        #: *p* value for the *F* statistic (*float*)
        self.pvalue = pvalue

    def _repr_html_(self):
        """
        Return an HTML summary of the *F* test

        :rtype: str
        """
        p_text = fmt_p(self.pvalue, html=True)
        template = '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'
        return template.format(self.dof1, self.dof2, self.statistic, p_text)

    def summary(self):
        """
        Return a stringified summary of the *F* test

        :rtype: str
        """
        p_text = fmt_p(self.pvalue, html=False)
        template = 'F({}, {}) = {:.2f}; p {}'
        return template.format(self.dof1, self.dof2, self.statistic, p_text)
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
    """
    Perform one-way ANOVA

    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (numeric)
    :type dep: str
    :param ind: Column in *df* for the independent variable (categorical)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.FTestResult`
    """
    # Keep only the two columns of interest, then check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)

    # One sample of the dependent variable per level of the independent variable
    grouped = df.groupby(ind)[dep]
    group_keys = grouped.groups
    samples = [grouped.get_group(key) for key in group_keys]

    # Perform one-way ANOVA
    result = stats.f_oneway(*samples)

    # stats.f_oneway does not report degrees of freedom, so derive them here
    # (see stats.f_oneway implementation)
    n_groups = len(group_keys)
    dof_between = n_groups - 1
    dof_within = len(df) - n_groups

    return FTestResult(result.statistic, dof_between, dof_within, result.pvalue)
2022-10-13 13:14:51 +11:00
# -----------------
# Mann-Whitney test
class MannWhitneyResult:
    """
    Result of a Mann-Whitney *U* test

    See :func:`yli.mannwhitney`.
    """

    def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
        #: Mann–Whitney *U* statistic (*float*)
        self.statistic = statistic
        #: *p* value for the *U* statistic (*float*)
        self.pvalue = pvalue
        #: Absolute value of the rank-biserial correlation (*float*)
        self.rank_biserial = rank_biserial
        #: Description of the direction of the effect (*str*)
        self.direction = direction
        #: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A
        self.brunnermunzel = brunnermunzel

    def _repr_html_(self):
        """
        Return an HTML summary of the Mann-Whitney test

        The Brunner-Munzel result, if present, is appended on its own line.

        :rtype: str
        """
        line1 = '<i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(
            self.statistic, fmt_p(self.pvalue, html=True), self.rank_biserial, self.direction)

        if self.brunnermunzel:
            return line1 + '<br>' + self.brunnermunzel._repr_html_()
        return line1

    def summary(self):
        """
        Return a stringified summary of the Mann-Whitney test

        The Brunner-Munzel result, if present, is appended on its own line.

        :rtype: str
        """
        # Round r to 2 decimal places so the text summary agrees with _repr_html_
        line1 = 'U = {:.1f}; p {}\nr = {:.2f}, {}'.format(
            self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)

        if self.brunnermunzel:
            return line1 + '\n' + self.brunnermunzel.summary()
        return line1
class BrunnerMunzelResult:
    """
    Result of a Brunner-Munzel test

    See :func:`yli.mannwhitney`. This library calls the Brunner-Munzel test statistic *W*.
    """

    def __init__(self, statistic, pvalue):
        #: *W* statistic (*float*)
        self.statistic = statistic
        #: *p* value for the *W* statistic (*float*)
        self.pvalue = pvalue

    def _repr_html_(self):
        """
        Return an HTML summary of the Brunner-Munzel test

        :rtype: str
        """
        return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, html=True))

    def summary(self):
        """
        Return a stringified summary of the Brunner-Munzel test

        :rtype: str
        """
        return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False))
2022-10-13 13:14:51 +11:00
def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
    """
    Perform a Mann-Whitney *U* test

    By default, this function performs a Brunner-Munzel test if the Mann-Whitney test is significant.
    If the Mann-Whitney test is significant but the Brunner-Munzel test is not, a warning is raised.
    The Brunner-Munzel test is returned only if non-significant.

    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (numeric)
    :type dep: str
    :param ind: Column in *df* for the independent variable (dichotomous)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str
    :param brunnermunzel: Whether to compute the Brunner-Munzel test if the Mann-Whitney test is significant
    :type brunnermunzel: bool
    :param use_continuity: See *scipy.stats.mannwhitneyu*
    :param alternative: See *scipy.stats.mannwhitneyu*
    :param method: See *scipy.stats.mannwhitneyu*

    :rtype: :class:`yli.sig_tests.MannWhitneyResult`
    """
    # Keep only the two columns of interest, then check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)

    # Ensure 2 groups for ind
    group1, data1, group2, data2 = as_2groups(df, dep, ind)

    # Do Mann-Whitney test
    # Stata does not perform continuity correction
    mw = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)

    # U statistics for each sample; the two always sum to the number of pairs
    n_pairs = len(data1) * len(data2)
    u1 = mw.statistic
    u2 = n_pairs - u1

    # Rank-biserial correlation, reported as an absolute value
    rank_biserial = abs(2 * u1 / n_pairs - 1)

    # A smaller U for the first sample means the second group ranks higher
    if u1 < u2:
        direction = '{1} > {0}'.format(group1, group2)
    else:
        direction = '{0} > {1}'.format(group1, group2)

    # If significant, perform a Brunner-Munzel test for our interest
    # It is attached to the result only when it disagrees with Mann-Whitney
    bm_result = None
    if mw.pvalue < 0.05 and brunnermunzel:
        bm = stats.brunnermunzel(data1, data2)

        if bm.pvalue >= 0.05:
            warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
            bm_result = BrunnerMunzelResult(statistic=bm.statistic, pvalue=bm.pvalue)

    return MannWhitneyResult(
        statistic=min(u1, u2),
        pvalue=mw.pvalue,
        rank_biserial=rank_biserial,
        direction=direction,
        brunnermunzel=bm_result,
    )
2022-10-13 13:25:24 +11:00
# ------------------------
# Pearson chi-squared test
class PearsonChiSquaredResult:
    """
    Result of a Pearson *χ*:sup:`2` test

    See :func:`yli.chi2`.
    """

    def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
        #: Contingency table for the observations (*DataFrame*)
        self.ct = ct
        #: *χ*:sup:`2` statistic (*float*)
        self.statistic = statistic
        #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
        self.dof = dof
        #: *p* value for the *χ*:sup:`2` test (*float*)
        self.pvalue = pvalue
        #: Odds ratio (*float*; *None* if not a 2×2 table)
        self.oddsratio = oddsratio
        #: Risk ratio (*float*; *None* if not a 2×2 table)
        self.riskratio = riskratio

    def _repr_html_(self):
        """
        Return an HTML summary of the *χ*:sup:`2` test

        The odds ratio and risk ratio lines are included only when an odds ratio is available.

        :rtype: str
        """
        table_html = self.ct._repr_html_()
        p_text = fmt_p(self.pvalue, html=True)

        if self.oddsratio is None:
            # No ratios available: table and test statistic only
            return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
                table_html, self.dof, self.statistic, p_text)

        ci_percent = (1 - config.alpha) * 100
        return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
            table_html, self.dof, self.statistic, p_text, ci_percent, self.oddsratio.summary(), self.riskratio.summary())

    def summary(self):
        """
        Return a stringified summary of the *χ*:sup:`2` test

        The odds ratio and risk ratio lines are included only when an odds ratio is available.

        :rtype: str
        """
        p_text = fmt_p(self.pvalue, html=False)

        if self.oddsratio is None:
            # No ratios available: table and test statistic only
            return '{}\nχ²({}) = {:.2f}; p {}'.format(
                self.ct, self.dof, self.statistic, p_text)

        ci_percent = (1 - config.alpha) * 100
        return '{0}\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
            self.ct, self.dof, self.statistic, p_text, ci_percent, self.oddsratio.summary(), self.riskratio.summary())
2022-10-13 13:25:24 +11:00
def chi2(df, dep, ind, *, nan_policy='warn'):
    """
    Perform a Pearson *χ*:sup:`2` test

    If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
    The ratios are calculated for the higher-valued value in each variable (i.e. ``True`` compared with ``False`` for a boolean).
    The risk ratio is calculated relative to the independent variable.

    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (categorical)
    :type dep: str
    :param ind: Column in *df* for the independent variable (categorical)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
    """
    # Keep only the two columns of interest, then check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)

    # Compute contingency table (independent variable in rows)
    ct = pd.crosstab(df[ind], df[dep])

    # Get expected counts
    expected = stats.contingency.expected_freq(ct)

    # Warn on low expected counts: more than 20% of cells below 5, or any cell below 1
    n_cells = expected.size
    n_below_5 = (expected < 5).sum()
    if n_below_5 / n_cells > 0.2:
        warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format(n_below_5, n_cells, n_below_5 / n_cells * 100))

    below_1 = expected < 1
    if below_1.any():
        warnings.warn('{} cells have expected count < 1'.format(below_1.sum()))

    if ct.shape != (2, 2):
        # rxc table
        # Just use SciPy
        result = stats.chi2_contingency(ct, correction=False)
        return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])

    # 2x2 table
    # Use statsmodels to get OR and RR
    # np.flip reverses both axes of the table before it is handed to statsmodels;
    # presumably this puts the higher-valued level of each variable first, as the docstring describes
    table = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
    result = table.test_nominal_association()
    or_low, or_high = table.oddsratio_confint(config.alpha)
    rr_low, rr_high = table.riskratio_confint(config.alpha)

    return PearsonChiSquaredResult(
        ct=ct,
        statistic=result.statistic,
        dof=result.df,
        pvalue=result.pvalue,
        oddsratio=Estimate(table.oddsratio, or_low, or_high),
        riskratio=Estimate(table.riskratio, rr_low, rr_high),
    )
2022-10-14 20:18:25 +11:00
# -------------------
# Pearson correlation
class PearsonRResult:
    """
    Result of Pearson correlation

    See :func:`yli.pearsonr`.
    """

    def __init__(self, statistic, pvalue):
        #: Pearson *r* correlation statistic with its confidence interval
        #: (an object providing ``summary()``; :func:`yli.pearsonr` supplies a :class:`yli.utils.Estimate`)
        self.statistic = statistic
        #: *p* value for the *r* statistic (*float*)
        self.pvalue = pvalue

    def _repr_html_(self):
        """
        Return an HTML summary of the Pearson correlation

        :rtype: str
        """
        ci_percent = (1 - config.alpha) * 100
        template = '<i>r</i> ({:g}% CI) = {}; <i>p</i> {}'
        return template.format(ci_percent, self.statistic.summary(), fmt_p(self.pvalue, html=True))

    def summary(self):
        """
        Return a stringified summary of the Pearson correlation

        :rtype: str
        """
        ci_percent = (1 - config.alpha) * 100
        template = 'r ({:g}% CI) = {}; p {}'
        return template.format(ci_percent, self.statistic.summary(), fmt_p(self.pvalue, html=False))
2022-10-14 20:18:25 +11:00
def pearsonr(df, dep, ind, *, nan_policy='warn'):
    """
    Compute the Pearson correlation coefficient (Pearson's r)

    The confidence interval for *r* is computed at the ``1 - config.alpha`` level,
    which is the level :class:`yli.sig_tests.PearsonRResult` reports when formatting the result.

    :param df: Data to perform the test on
    :type df: DataFrame
    :param dep: Column in *df* for the dependent variable (numerical)
    :type dep: str
    :param ind: Column in *df* for the independent variable (numerical)
    :type ind: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.PearsonRResult`
    """
    # Keep only the two columns of interest, then check for/clean NaNs
    df = check_nan(df[[ind, dep]], nan_policy)

    # Compute Pearson's r
    result = stats.pearsonr(df[ind], df[dep])

    # Request the interval at the configured level explicitly: SciPy defaults to 95%,
    # which would mismatch the "(1 - alpha) % CI" label whenever config.alpha != 0.05
    ci = result.confidence_interval(confidence_level=1 - config.alpha)

    return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)