2022-10-11 22:52:42 +11:00
|
|
|
# scipy-yli: Helpful SciPy utilities and recipes
|
|
|
|
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Affero General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Affero General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
2022-10-13 13:25:24 +11:00
|
|
|
import numpy as np
|
2022-10-11 22:52:42 +11:00
|
|
|
import pandas as pd
|
|
|
|
from scipy import stats
|
|
|
|
import statsmodels.api as sm
|
|
|
|
|
|
|
|
import functools
|
|
|
|
import warnings
|
|
|
|
|
2022-10-14 14:48:26 +11:00
|
|
|
from .utils import Estimate, as_2groups, check_nan, fmt_p
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# ----------------
|
|
|
|
# Student's t test
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
class TTestResult:
|
|
|
|
"""
|
|
|
|
Result of a Student's t test
|
|
|
|
|
|
|
|
delta: Mean difference
|
|
|
|
"""
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
def __init__(self, statistic, dof, pvalue, delta, delta_direction):
|
2022-10-11 22:52:42 +11:00
|
|
|
self.statistic = statistic
|
|
|
|
self.dof = dof
|
|
|
|
self.pvalue = pvalue
|
|
|
|
self.delta = delta
|
2022-10-13 12:53:18 +11:00
|
|
|
self.delta_direction = delta_direction
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
def _repr_html_(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.delta.summary(), self.delta_direction)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.delta.summary(), self.delta_direction)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
|
|
|
"""Perform an independent-sample Student's t test"""
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# Check for/clean NaNs
|
2022-10-11 22:52:42 +11:00
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# Ensure 2 groups for ind
|
|
|
|
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
# Do t test
|
|
|
|
# Use statsmodels rather than SciPy because this provides the mean difference automatically
|
2022-10-13 12:53:18 +11:00
|
|
|
d1 = sm.stats.DescrStatsW(data1)
|
|
|
|
d2 = sm.stats.DescrStatsW(data2)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
cm = sm.stats.CompareMeans(d1, d2)
|
2022-10-11 22:52:42 +11:00
|
|
|
statistic, pvalue, dof = cm.ttest_ind()
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
delta = d1.mean - d2.mean
|
2022-10-11 22:52:42 +11:00
|
|
|
ci0, ci1 = cm.tconfint_diff()
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# t test is symmetric so take absolute values
|
|
|
|
return TTestResult(
|
|
|
|
statistic=abs(statistic), dof=dof, pvalue=pvalue,
|
|
|
|
delta=abs(Estimate(delta, ci0, ci1)),
|
|
|
|
delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
|
|
|
# -----------------
|
|
|
|
# Mann-Whitney test
|
|
|
|
|
|
|
|
class MannWhitneyResult:
|
|
|
|
"""
|
|
|
|
Result of a Mann-Whitney test
|
|
|
|
|
|
|
|
brunnermunzel: BrunnerMunzelResult on same data
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(self, statistic, pvalue, rank_biserial, direction, brunnermunzel=None):
|
|
|
|
self.statistic = statistic
|
|
|
|
self.pvalue = pvalue
|
|
|
|
self.rank_biserial = rank_biserial
|
|
|
|
self.direction = direction
|
|
|
|
self.brunnermunzel = brunnermunzel
|
|
|
|
|
|
|
|
def _repr_html_(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
line1 = '<i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self.statistic, fmt_p(self.pvalue, html=True), self.rank_biserial, self.direction)
|
2022-10-13 13:14:51 +11:00
|
|
|
if self.brunnermunzel:
|
|
|
|
return line1 + '<br>' + self.brunnermunzel._repr_html_()
|
|
|
|
else:
|
|
|
|
return line1
|
|
|
|
|
|
|
|
def summary(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
line1 = 'U = {:.1f}; p {}\nr = {}, {}'.format(self.statistic, fmt_p(self.pvalue, html=False), self.rank_biserial, self.direction)
|
2022-10-13 13:14:51 +11:00
|
|
|
if self.brunnermunzel:
|
|
|
|
return line1 + '\n' + self.brunnermunzel.summary()
|
|
|
|
else:
|
|
|
|
return line1
|
|
|
|
|
|
|
|
class BrunnerMunzelResult:
|
|
|
|
"""Result of a Brunner-Munzel test"""
|
|
|
|
|
|
|
|
def __init__(self, statistic, pvalue):
|
|
|
|
self.statistic = statistic
|
|
|
|
self.pvalue = pvalue
|
|
|
|
|
|
|
|
def _repr_html_(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, html=True))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-14 14:48:26 +11:00
|
|
|
return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, html=False))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
|
|
|
def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
|
|
|
|
"""
|
|
|
|
Perform a Mann-Whitney test
|
|
|
|
|
|
|
|
brunnermunzel: Set to False to skip the Brunner-Munzel test
|
|
|
|
use_continuity, alternative, method: See scipy.stats.mannwhitneyu
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
|
|
|
# Ensure 2 groups for ind
|
|
|
|
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
|
|
|
|
|
|
|
# Do Mann-Whitney test
|
|
|
|
# Stata does not perform continuity correction
|
|
|
|
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
|
|
|
|
|
|
|
|
u1 = result.statistic
|
|
|
|
u2 = len(data1) * len(data2) - u1
|
|
|
|
r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial
|
|
|
|
|
|
|
|
# If significant, perform a Brunner-Munzel test for our interest
|
|
|
|
if result.pvalue < 0.05 and brunnermunzel:
|
|
|
|
result_bm = stats.brunnermunzel(data1, data2)
|
|
|
|
|
|
|
|
if result_bm.pvalue >= 0.05:
|
|
|
|
warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
|
|
|
|
|
|
|
|
return MannWhitneyResult(
|
|
|
|
statistic=min(u1, u2), pvalue=result.pvalue,
|
|
|
|
#med1=data1.median(), med2=data2.median(),
|
|
|
|
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),
|
|
|
|
brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))
|
|
|
|
|
|
|
|
return MannWhitneyResult(
|
|
|
|
statistic=min(u1, u2), pvalue=result.pvalue,
|
|
|
|
#med1=data1.median(), med2=data2.median(),
|
|
|
|
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
# ------------------------
|
|
|
|
# Pearson chi-squared test
|
|
|
|
|
|
|
|
class PearsonChiSquaredResult:
|
|
|
|
"""Result of a Pearson chi-squared test"""
|
|
|
|
|
|
|
|
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
|
|
|
|
self.ct = ct
|
|
|
|
self.statistic = statistic
|
|
|
|
self.dof = dof
|
|
|
|
self.pvalue = pvalue
|
|
|
|
self.oddsratio = oddsratio
|
|
|
|
self.riskratio = riskratio
|
|
|
|
|
|
|
|
def _repr_html_(self):
|
|
|
|
if self.oddsratio is not None:
|
|
|
|
return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}<br>OR (95% CI) = {}<br>RR (95% CI) = {}'.format(
|
2022-10-14 14:48:26 +11:00
|
|
|
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True), self.oddsratio.summary(), self.riskratio.summary())
|
2022-10-13 13:25:24 +11:00
|
|
|
else:
|
|
|
|
return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
|
2022-10-14 14:48:26 +11:00
|
|
|
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, html=True))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
def summary(self):
|
|
|
|
if self.oddsratio is not None:
|
|
|
|
return '{}\nχ²({}) = {:.2f}; p {}\nOR (95% CI) = {}\nRR (95% CI) = {}'.format(
|
2022-10-14 14:48:26 +11:00
|
|
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False), self.oddsratio.summary(), self.riskratio.summary())
|
2022-10-13 13:25:24 +11:00
|
|
|
else:
|
|
|
|
return '{}\nχ²({}) = {:.2f}; p {}'.format(
|
2022-10-14 14:48:26 +11:00
|
|
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, html=False))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
def chi2(df, dep, ind, *, nan_policy='warn'):
|
2022-10-14 20:08:01 +11:00
|
|
|
"""Perform a Pearson chi-squared test"""
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
|
|
|
# Compute contingency table
|
|
|
|
ct = pd.crosstab(df[ind], df[dep])
|
|
|
|
|
|
|
|
# Get expected counts
|
|
|
|
expected = stats.contingency.expected_freq(ct)
|
|
|
|
|
|
|
|
# Warn on low expected counts
|
|
|
|
if (expected < 5).sum() / expected.size > 0.2:
|
|
|
|
warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
|
|
|
|
if (expected < 1).any():
|
|
|
|
warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
|
|
|
|
|
|
|
|
if ct.shape == (2,2):
|
|
|
|
# 2x2 table
|
2022-10-14 20:08:01 +11:00
|
|
|
# Use statsmodels to get OR and RR
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
|
|
|
|
result = smct.test_nominal_association()
|
|
|
|
ORci = smct.oddsratio_confint()
|
|
|
|
RRci = smct.riskratio_confint()
|
|
|
|
|
|
|
|
return PearsonChiSquaredResult(
|
|
|
|
ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
|
|
|
|
oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
|
|
|
|
else:
|
|
|
|
# rxc table
|
|
|
|
# Just use SciPy
|
|
|
|
|
|
|
|
result = stats.chi2_contingency(ct, correction=False)
|
|
|
|
|
|
|
|
return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
|