2022-10-11 22:52:42 +11:00
|
|
|
# scipy-yli: Helpful SciPy utilities and recipes
|
|
|
|
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Affero General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Affero General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
2022-10-13 13:25:24 +11:00
|
|
|
import numpy as np
|
2022-10-11 22:52:42 +11:00
|
|
|
import pandas as pd
|
|
|
|
from scipy import stats
|
|
|
|
import statsmodels.api as sm
|
|
|
|
|
|
|
|
import functools
|
|
|
|
import warnings
|
|
|
|
|
2022-10-15 23:24:29 +11:00
|
|
|
from .config import config
|
2022-11-09 22:08:27 +11:00
|
|
|
from .utils import ConfidenceInterval, Estimate, PValueStyle, as_2groups, check_nan, convert_pandas_nullable, fmt_p
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# ----------------
|
|
|
|
# Student's t test
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
class TTestResult:
|
|
|
|
"""
|
2022-10-17 21:41:19 +11:00
|
|
|
Result of a Student's *t* test
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
See :func:`yli.ttest_ind`.
|
2022-10-11 22:52:42 +11:00
|
|
|
"""
|
|
|
|
|
2022-11-09 17:01:45 +11:00
|
|
|
def __init__(self, statistic, dof, pvalue, group1, group2, mu1, mu2, sd1, sd2, delta, delta_direction):
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *t* statistic (*float*)
|
2022-10-11 22:52:42 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Degrees of freedom of the *t* distribution (*int*)
|
2022-10-11 22:52:42 +11:00
|
|
|
self.dof = dof
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *t* statistic (*float*)
|
2022-10-11 22:52:42 +11:00
|
|
|
self.pvalue = pvalue
|
2022-11-09 22:08:27 +11:00
|
|
|
|
2022-11-09 17:01:45 +11:00
|
|
|
#: Name of the first group (*str*)
|
|
|
|
self.group1 = group1
|
|
|
|
#: Name of the second group (*str*)
|
|
|
|
self.group2 = group2
|
|
|
|
#: Mean of the first group (*float*)
|
|
|
|
self.mu1 = mu1
|
|
|
|
#: Mean of the second group (*float*)
|
|
|
|
self.mu2 = mu2
|
|
|
|
#: Standard deviation of the first group (*float*)
|
|
|
|
self.sd1 = sd1
|
|
|
|
#: Standard deviation of the second group (*float*)
|
|
|
|
self.sd2 = sd2
|
2022-11-09 22:08:27 +11:00
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Absolute value of the mean difference (:class:`yli.utils.Estimate`)
|
2022-10-11 22:52:42 +11:00
|
|
|
self.delta = delta
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Description of the direction of the effect (*str*)
|
2022-10-13 12:53:18 +11:00
|
|
|
self.delta_direction = delta_direction
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-11-09 18:18:47 +11:00
|
|
|
def _comparison_table(self, html):
|
|
|
|
"""Return a table showing the means/SDs for each group"""
|
|
|
|
|
|
|
|
table_data = {
|
|
|
|
self.group1: '{:.2f} ({:.2f})'.format(self.mu1, self.sd1),
|
|
|
|
self.group2: '{:.2f} ({:.2f})'.format(self.mu2, self.sd2),
|
|
|
|
}
|
|
|
|
|
|
|
|
if html:
|
|
|
|
table = pd.DataFrame(table_data, index=['\ue000 (SD)']) # U+E000 is in Private Use Area, mark μ symbol
|
|
|
|
table_str = table._repr_html_()
|
|
|
|
return table_str.replace('\ue000', '<i>μ</i>')
|
|
|
|
else:
|
|
|
|
table = pd.DataFrame(table_data, index=['μ (SD)'])
|
|
|
|
return str(table)
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-11 22:52:42 +11:00
|
|
|
def _repr_html_(self):
|
2022-11-09 18:18:47 +11:00
|
|
|
return '{}<br><i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br>Δ<i>μ</i> ({:g}% CI) = {}, {}'.format(self._comparison_table(True), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the *t* test
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-11-09 18:18:47 +11:00
|
|
|
return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Perform an independent 2-sample Student's *t* test
|
|
|
|
|
|
|
|
:param df: Data to perform the test on
|
|
|
|
:type df: DataFrame
|
|
|
|
:param dep: Column in *df* for the dependent variable (numeric)
|
|
|
|
:type dep: str
|
|
|
|
:param ind: Column in *df* for the independent variable (dichotomous)
|
|
|
|
:type ind: str
|
|
|
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
|
|
|
:type nan_policy: str
|
|
|
|
|
|
|
|
:rtype: :class:`yli.sig_tests.TTestResult`
|
2022-10-18 19:25:12 +11:00
|
|
|
|
|
|
|
**Example:**
|
|
|
|
|
|
|
|
.. code-block::
|
|
|
|
|
|
|
|
df = pd.DataFrame({
|
|
|
|
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
|
|
|
|
'Potency': [10.2, 10.5, 10.3, ...]
|
|
|
|
})
|
|
|
|
yli.ttest_ind(df, 'Potency', 'Type')
|
|
|
|
|
|
|
|
.. code-block:: text
|
2022-11-09 18:18:47 +11:00
|
|
|
Fresh Stored
|
|
|
|
μ (SD) 10.37 (0.32) 9.83 (0.24)
|
2022-10-18 19:25:12 +11:00
|
|
|
|
|
|
|
t(18) = 4.24; p < 0.001*
|
|
|
|
Δμ (95% CI) = 0.54 (0.27–0.81), Fresh > Stored
|
|
|
|
|
|
|
|
The output states that the value of the *t* statistic is 4.24, the *t* distribution has 18 degrees of freedom, and the test is significant with *p* value < 0.001.
|
|
|
|
The mean difference is 0.54 in favour of the *Fresh* group, with 95% confidence interval 0.27–0.81.
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# Check for/clean NaNs
|
2022-10-11 22:52:42 +11:00
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
# Ensure 2 groups for ind
|
|
|
|
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
|
|
|
# Do t test
|
|
|
|
# Use statsmodels rather than SciPy because this provides the mean difference automatically
|
2022-11-09 17:01:45 +11:00
|
|
|
d1 = sm.stats.DescrStatsW(data1, ddof=1)
|
|
|
|
d2 = sm.stats.DescrStatsW(data2, ddof=1)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
cm = sm.stats.CompareMeans(d1, d2)
|
2022-10-11 22:52:42 +11:00
|
|
|
statistic, pvalue, dof = cm.ttest_ind()
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
delta = d1.mean - d2.mean
|
2022-10-15 23:24:29 +11:00
|
|
|
ci0, ci1 = cm.tconfint_diff(config.alpha)
|
2022-10-11 22:52:42 +11:00
|
|
|
|
2022-11-09 17:01:45 +11:00
|
|
|
# t test is symmetric so take absolute value
|
|
|
|
if d2.mean > d1.mean:
|
|
|
|
delta, ci0, ci1 = -delta, -ci1, -ci0
|
|
|
|
d1, d2 = d2, d1
|
|
|
|
group1, group2 = group2, group1
|
|
|
|
# Now group1 > group2
|
|
|
|
|
2022-10-13 12:53:18 +11:00
|
|
|
return TTestResult(
|
|
|
|
statistic=abs(statistic), dof=dof, pvalue=pvalue,
|
2022-11-09 17:01:45 +11:00
|
|
|
group1=group1, group2=group2, mu1=d1.mean, mu2=d2.mean, sd1=d1.std, sd2=d2.std,
|
|
|
|
delta=Estimate(delta, ci0, ci1),
|
|
|
|
delta_direction='{} > {}'.format(group1, group2))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
2022-10-14 20:18:25 +11:00
|
|
|
# -------------
|
|
|
|
# One-way ANOVA
|
|
|
|
|
|
|
|
class FTestResult:
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Result of an *F* test for ANOVA/regression
|
|
|
|
|
|
|
|
See :func:`yli.anova_oneway` and :meth:`yli.regress.RegressionResult.ftest`.
|
|
|
|
"""
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def __init__(self, statistic, dof1, dof2, pvalue):
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *F* statistic (*float*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Degrees of freedom in the *F* distribution numerator (*int*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.dof1 = dof1
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Degrees of freedom in the *F* distribution denominator (*int*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.dof2 = dof2
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *F* statistic (*float*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.pvalue = pvalue
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-14 20:18:25 +11:00
|
|
|
def _repr_html_(self):
|
2022-11-09 17:39:33 +11:00
|
|
|
return '<i>F</i>({:.0f}, {:.0f}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the *F* test
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-11-09 17:39:33 +11:00
|
|
|
return 'F({:.0f}, {:.0f}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Perform one-way ANOVA
|
|
|
|
|
|
|
|
:param df: Data to perform the test on
|
|
|
|
:type df: DataFrame
|
|
|
|
:param dep: Column in *df* for the dependent variable (numeric)
|
|
|
|
:type dep: str
|
|
|
|
:param ind: Column in *df* for the independent variable (categorical)
|
|
|
|
:type ind: str
|
|
|
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
|
|
|
:type nan_policy: str
|
|
|
|
|
|
|
|
:rtype: :class:`yli.sig_tests.FTestResult`
|
2022-10-18 19:25:12 +11:00
|
|
|
|
|
|
|
**Example:**
|
|
|
|
|
|
|
|
.. code-block::
|
|
|
|
|
|
|
|
df = pd.DataFrame({
|
|
|
|
'Method': [1]*8 + [2]*7 + [3]*9,
|
|
|
|
'Score': [96, 79, 91, ...]
|
|
|
|
})
|
|
|
|
yli.anova_oneway(df, 'Score', 'Method')
|
|
|
|
|
|
|
|
.. code-block:: text
|
|
|
|
|
|
|
|
F(2, 21) = 29.57; p < 0.001*
|
|
|
|
|
|
|
|
The output states that the value of the *F* statistic is 29.57, the *F* distribution has 2 degrees of freedom in the numerator and 21 in the denominator, and the test is significant with *p* value < 0.001.
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
|
|
|
# Group by independent variable
|
|
|
|
groups = df.groupby(ind)[dep]
|
|
|
|
|
|
|
|
# Perform one-way ANOVA
|
|
|
|
result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
|
|
|
|
|
|
|
|
# See stats.f_oneway implementation
|
|
|
|
dfbn = len(groups.groups) - 1
|
|
|
|
dfwn = len(df) - len(groups.groups)
|
|
|
|
|
|
|
|
return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)
|
|
|
|
|
2022-10-13 13:14:51 +11:00
|
|
|
# -----------------
|
|
|
|
# Mann-Whitney test
|
|
|
|
|
|
|
|
class MannWhitneyResult:
|
|
|
|
"""
|
2022-10-17 21:41:19 +11:00
|
|
|
Result of a Mann-Whitney *U* test
|
2022-10-13 13:14:51 +11:00
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
See :func:`yli.mannwhitney`.
|
2022-10-13 13:14:51 +11:00
|
|
|
"""
|
|
|
|
|
2022-11-09 22:08:27 +11:00
|
|
|
def __init__(self, statistic, pvalue, group1, group2, med1, med2, iqr1, iqr2, range1, range2, rank_biserial, direction, brunnermunzel=None):
|
2022-10-18 19:25:12 +11:00
|
|
|
#: Lesser of the two Mann–Whitney *U* statistics (*float*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *U* statistic (*float*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.pvalue = pvalue
|
2022-11-09 22:08:27 +11:00
|
|
|
|
|
|
|
#: Name of the first group (*str*)
|
|
|
|
self.group1 = group1
|
|
|
|
#: Name of the second group (*str*)
|
|
|
|
self.group2 = group2
|
|
|
|
#: Median of the first group (*float*)
|
|
|
|
self.med1 = med1
|
|
|
|
#: Median of the second group (*float*)
|
|
|
|
self.med2 = med2
|
|
|
|
#: Interquartile range of the first group (:class:`yli.utils.ConfidenceInterval`)
|
|
|
|
self.iqr1 = iqr1
|
|
|
|
#: Interquartile range of the second group (:class:`yli.utils.ConfidenceInterval`)
|
|
|
|
self.iqr2 = iqr2
|
|
|
|
#: Range of the first group (:class:`yli.utils.ConfidenceInterval`)
|
|
|
|
self.range1 = range2
|
|
|
|
#: Range of the second group (:class:`yli.utils.ConfidenceInterval`)
|
|
|
|
self.range2 = range2
|
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Absolute value of the rank-biserial correlation (*float*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.rank_biserial = rank_biserial
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Description of the direction of the effect (*str*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.direction = direction
|
2022-11-09 22:08:27 +11:00
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
#: :class:`BrunnerMunzelResult` on the same data, or *None* if N/A
|
2022-10-13 13:14:51 +11:00
|
|
|
self.brunnermunzel = brunnermunzel
|
|
|
|
|
2022-11-09 22:08:27 +11:00
|
|
|
def _comparison_table(self, html):
|
|
|
|
"""Return a table showing the medians/IQRs/ranges for each group"""
|
|
|
|
|
|
|
|
table_data = {
|
|
|
|
self.group1: ['{:.2f} ({})'.format(self.med1, self.iqr1.summary()), '{:.2f} ({})'.format(self.med1, self.range1.summary())],
|
|
|
|
self.group2: ['{:.2f} ({})'.format(self.med2, self.iqr2.summary()), '{:.2f} ({})'.format(self.med2, self.range2.summary())],
|
|
|
|
}
|
|
|
|
table = pd.DataFrame(table_data, index=['Median (IQR)', 'Median (range)'])
|
|
|
|
|
|
|
|
if html:
|
|
|
|
return table._repr_html_()
|
|
|
|
else:
|
|
|
|
return str(table)
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-13 13:14:51 +11:00
|
|
|
def _repr_html_(self):
|
2022-11-09 22:08:27 +11:00
|
|
|
line1 = '{}<br><i>U</i> = {:.1f}; <i>p</i> {}<br><i>r</i> = {:.2f}, {}'.format(self._comparison_table(True), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), self.rank_biserial, self.direction)
|
2022-10-13 13:14:51 +11:00
|
|
|
if self.brunnermunzel:
|
|
|
|
return line1 + '<br>' + self.brunnermunzel._repr_html_()
|
|
|
|
else:
|
|
|
|
return line1
|
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the Mann–Whitney test
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-11-09 22:08:27 +11:00
|
|
|
line1 = '{}\n\nU = {:.1f}; p {}\nr = {:.2f}, {}'.format(self._comparison_table(False), self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), self.rank_biserial, self.direction)
|
2022-10-13 13:14:51 +11:00
|
|
|
if self.brunnermunzel:
|
|
|
|
return line1 + '\n' + self.brunnermunzel.summary()
|
|
|
|
else:
|
|
|
|
return line1
|
|
|
|
|
|
|
|
class BrunnerMunzelResult:
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Result of a Brunner–Munzel test
|
|
|
|
|
|
|
|
See :func:`yli.mannwhitney`. This library calls the Brunner–Munzel test statistic *W*.
|
|
|
|
"""
|
|
|
|
|
2022-10-13 13:14:51 +11:00
|
|
|
"""Result of a Brunner-Munzel test"""
|
|
|
|
|
|
|
|
def __init__(self, statistic, pvalue):
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *W* statistic (*float*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *W* statistic (*float*)
|
2022-10-13 13:14:51 +11:00
|
|
|
self.pvalue = pvalue
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-13 13:14:51 +11:00
|
|
|
def _repr_html_(self):
|
2022-11-09 17:39:33 +11:00
|
|
|
return '<i>W</i> = {:.1f}; <i>p</i> {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the Brunner–Munzel test
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-11-09 17:39:33 +11:00
|
|
|
return 'W = {:.1f}; p {}'.format(self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
2022-10-13 13:14:51 +11:00
|
|
|
|
|
|
|
def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_continuity=False, alternative='two-sided', method='auto'):
|
|
|
|
"""
|
2022-10-17 21:41:19 +11:00
|
|
|
Perform a Mann-Whitney *U* test
|
|
|
|
|
|
|
|
By default, this function performs a Brunner–Munzel test if the Mann–Whitney test is significant.
|
|
|
|
If the Mann–Whitney test is significant but the Brunner–Munzel test is not, a warning is raised.
|
|
|
|
The Brunner–Munzel test is returned only if non-significant.
|
|
|
|
|
|
|
|
:param df: Data to perform the test on
|
|
|
|
:type df: DataFrame
|
|
|
|
:param dep: Column in *df* for the dependent variable (numeric)
|
|
|
|
:type dep: str
|
|
|
|
:param ind: Column in *df* for the independent variable (dichotomous)
|
|
|
|
:type ind: str
|
|
|
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
|
|
|
:type nan_policy: str
|
|
|
|
:param brunnermunzel: Whether to compute the Brunner–Munzel test if the Mann–Whitney test is significant
|
|
|
|
:type brunnermunzel: bool
|
|
|
|
:param use_continuity: See *scipy.stats.mannwhitneyu*
|
|
|
|
:param alternative: See *scipy.stats.mannwhitneyu*
|
|
|
|
:param method: See *scipy.stats.mannwhitneyu*
|
|
|
|
|
2022-11-09 17:05:04 +11:00
|
|
|
:return: The result of the Mann–Whitney test. The result of a Brunner–Munzel test is included in the result object if and only if *brunnermunzel* is *True*, *and* the Mann–Whitney test is significant, *and* the Brunner–Munzel test is non-significant.
|
2022-10-20 20:57:24 +11:00
|
|
|
|
2022-10-17 21:41:19 +11:00
|
|
|
:rtype: :class:`yli.sig_tests.MannWhitneyResult`
|
2022-10-18 19:25:12 +11:00
|
|
|
|
|
|
|
**Example:**
|
|
|
|
|
|
|
|
.. code-block::
|
|
|
|
|
|
|
|
df = pd.DataFrame({
|
|
|
|
'Sample': ['Before'] * 12 + ['After'] * 12,
|
|
|
|
'Oxygen': [11.0, 11.2, 11.2, ...]
|
|
|
|
})
|
|
|
|
yli.mannwhitney(df, 'Oxygen', 'Sample', method='asymptotic', alternative='less')
|
|
|
|
|
|
|
|
.. code-block:: text
|
|
|
|
|
2022-11-09 22:08:27 +11:00
|
|
|
After Before
|
|
|
|
Median (IQR) 10.75 (10.55–10.95) 11.55 (11.20–11.83)
|
|
|
|
Median (range) 10.75 (11.00–12.10) 11.55 (11.00–12.10)
|
|
|
|
|
2022-10-18 19:25:12 +11:00
|
|
|
U = 6.0; p < 0.001*
|
|
|
|
r = 0.92, Before > After
|
|
|
|
|
|
|
|
The output states that the value of the Mann–Whitney *U* statistic is 6.0, and the one-sided test is significant with asymptotic *p* value < 0.001.
|
|
|
|
The rank-biserial correlation is 0.92 in favour of the *Before* group.
|
2022-10-13 13:14:51 +11:00
|
|
|
"""
|
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
2022-10-20 20:58:42 +11:00
|
|
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
|
|
|
df = convert_pandas_nullable(df)
|
|
|
|
|
2022-10-13 13:14:51 +11:00
|
|
|
# Ensure 2 groups for ind
|
|
|
|
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
|
|
|
|
|
|
|
# Do Mann-Whitney test
|
|
|
|
# Stata does not perform continuity correction
|
|
|
|
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
|
|
|
|
|
|
|
|
u1 = result.statistic
|
|
|
|
u2 = len(data1) * len(data2) - u1
|
|
|
|
r = abs(2*u1 / (len(data1) * len(data2)) - 1) # rank-biserial
|
|
|
|
|
|
|
|
# If significant, perform a Brunner-Munzel test for our interest
|
|
|
|
if result.pvalue < 0.05 and brunnermunzel:
|
|
|
|
result_bm = stats.brunnermunzel(data1, data2)
|
|
|
|
|
|
|
|
if result_bm.pvalue >= 0.05:
|
|
|
|
warnings.warn('Mann-Whitney test is significant but Brunner-Munzel test is not. This could be due to a difference in shape, rather than location.')
|
|
|
|
|
|
|
|
return MannWhitneyResult(
|
|
|
|
statistic=min(u1, u2), pvalue=result.pvalue,
|
|
|
|
#med1=data1.median(), med2=data2.median(),
|
|
|
|
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2),
|
|
|
|
brunnermunzel=BrunnerMunzelResult(statistic=result_bm.statistic, pvalue=result_bm.pvalue))
|
|
|
|
|
|
|
|
return MannWhitneyResult(
|
|
|
|
statistic=min(u1, u2), pvalue=result.pvalue,
|
2022-11-09 22:08:27 +11:00
|
|
|
group1=group1, group2=group2,
|
|
|
|
med1=data1.median(), med2=data2.median(), iqr1=ConfidenceInterval(data1.quantile(0.25), data1.quantile(0.75)), iqr2=ConfidenceInterval(data2.quantile(0.25), data2.quantile(0.75)), range1=ConfidenceInterval(data1.min(), data1.max()), range2=ConfidenceInterval(data2.min(), data2.max()),
|
2022-10-13 13:14:51 +11:00
|
|
|
rank_biserial=r, direction=('{1} > {0}' if u1 < u2 else '{0} > {1}').format(group1, group2))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
# ------------------------
|
|
|
|
# Pearson chi-squared test
|
|
|
|
|
|
|
|
class PearsonChiSquaredResult:
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Result of a Pearson *χ*:sup:`2` test
|
|
|
|
|
|
|
|
See :func:`yli.chi2`.
|
|
|
|
"""
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Contingency table for the observations (*DataFrame*)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.ct = ct
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *χ*:sup:`2` statistic (*float*)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.dof = dof
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *χ*:sup:`2` test (*float*)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.pvalue = pvalue
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Odds ratio (*float*; *None* if not a 2×2 table)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.oddsratio = oddsratio
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Risk ratio (*float*; *None* if not a 2×2 table)
|
2022-10-13 13:25:24 +11:00
|
|
|
self.riskratio = riskratio
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-13 13:25:24 +11:00
|
|
|
def _repr_html_(self):
|
|
|
|
if self.oddsratio is not None:
|
2022-10-15 23:24:29 +11:00
|
|
|
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
|
2022-11-09 17:39:33 +11:00
|
|
|
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
|
2022-10-13 13:25:24 +11:00
|
|
|
else:
|
|
|
|
return '{}<br><i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(
|
2022-11-09 17:39:33 +11:00
|
|
|
self.ct._repr_html_(), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the *χ*:sup:`2` test
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-10-13 13:25:24 +11:00
|
|
|
if self.oddsratio is not None:
|
2022-10-18 19:25:12 +11:00
|
|
|
return '{0}\n\nχ²({1}) = {2:.2f}; p {3}\nOR ({4:g}% CI) = {5}\nRR ({4:g}% CI) = {6}'.format(
|
2022-11-09 17:39:33 +11:00
|
|
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.oddsratio.summary(), self.riskratio.summary())
|
2022-10-13 13:25:24 +11:00
|
|
|
else:
|
2022-10-18 19:25:12 +11:00
|
|
|
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
|
2022-11-09 17:39:33 +11:00
|
|
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
def chi2(df, dep, ind, *, nan_policy='warn'):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Perform a Pearson *χ*:sup:`2` test
|
|
|
|
|
|
|
|
If a 2×2 contingency table is obtained (i.e. if both variables are dichotomous), an odds ratio and risk ratio are calculated.
|
2022-10-18 19:25:12 +11:00
|
|
|
The ratios are calculated for the higher-valued value in each variable (i.e. *True* compared with *False* for a boolean).
|
|
|
|
The risk ratio is calculated relative to the independent variable (rows of the contingency table).
|
2022-10-17 21:41:19 +11:00
|
|
|
|
|
|
|
:param df: Data to perform the test on
|
|
|
|
:type df: DataFrame
|
|
|
|
:param dep: Column in *df* for the dependent variable (categorical)
|
|
|
|
:type dep: str
|
|
|
|
:param ind: Column in *df* for the independent variable (categorical)
|
|
|
|
:type ind: str
|
|
|
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
|
|
|
:type nan_policy: str
|
|
|
|
|
|
|
|
:rtype: :class:`yli.sig_tests.PearsonChiSquaredResult`
|
2022-10-18 19:25:12 +11:00
|
|
|
|
|
|
|
**Example:**
|
|
|
|
|
|
|
|
.. code-block::
|
|
|
|
|
|
|
|
df = pd.DataFrame({
|
|
|
|
'Response': np.repeat([False, True, False, True], [250, 750, 400, 1600]),
|
|
|
|
'Stress': np.repeat([False, False, True, True], [250, 750, 400, 1600])
|
|
|
|
})
|
|
|
|
yli.chi2(df, 'Stress', 'Response')
|
|
|
|
|
|
|
|
.. code-block:: text
|
|
|
|
|
|
|
|
Stress False True
|
|
|
|
Response
|
|
|
|
False 250 400
|
|
|
|
True 750 1600
|
|
|
|
|
|
|
|
χ²(1) = 9.82; p = 0.002*
|
|
|
|
OR (95% CI) = 1.33 (1.11–1.60)
|
|
|
|
RR (95% CI) = 1.11 (1.03–1.18)
|
|
|
|
|
|
|
|
The output shows the contingency table, and states that the value of the Pearson *χ*:sup:`2` statistic is 9.82, the *χ*:sup:`2` distribution has 1 degree of freedom, and the test is significant with *p* value 0.002.
|
|
|
|
|
|
|
|
The odds of *Stress* in the *Response* = *True* group are 1.33 times that in the *Response* = *False* group, with 95% confidence interval 1.11–1.60.
|
|
|
|
|
|
|
|
The risk of *Stress* in the *Response* = *True* group is 1.11 that in the *Response* = *False* group, with 95% confidence interval 1.03–1.18.
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
|
|
|
# Compute contingency table
|
|
|
|
ct = pd.crosstab(df[ind], df[dep])
|
|
|
|
|
|
|
|
# Get expected counts
|
|
|
|
expected = stats.contingency.expected_freq(ct)
|
|
|
|
|
|
|
|
# Warn on low expected counts
|
|
|
|
if (expected < 5).sum() / expected.size > 0.2:
|
|
|
|
warnings.warn('{} of {} cells ({:.0f}%) have expected count < 5'.format((expected < 5).sum(), expected.size, (expected < 5).sum() / expected.size * 100))
|
|
|
|
if (expected < 1).any():
|
|
|
|
warnings.warn('{} cells have expected count < 1'.format((expected < 1).sum()))
|
|
|
|
|
|
|
|
if ct.shape == (2,2):
|
|
|
|
# 2x2 table
|
2022-10-14 20:08:01 +11:00
|
|
|
# Use statsmodels to get OR and RR
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
smct = sm.stats.Table2x2(np.flip(ct.to_numpy()), shift_zeros=False)
|
|
|
|
result = smct.test_nominal_association()
|
2022-10-15 23:24:29 +11:00
|
|
|
ORci = smct.oddsratio_confint(config.alpha)
|
|
|
|
RRci = smct.riskratio_confint(config.alpha)
|
2022-10-13 13:25:24 +11:00
|
|
|
|
|
|
|
return PearsonChiSquaredResult(
|
|
|
|
ct=ct, statistic=result.statistic, dof=result.df, pvalue=result.pvalue,
|
|
|
|
oddsratio=Estimate(smct.oddsratio, ORci[0], ORci[1]), riskratio=Estimate(smct.riskratio, RRci[0], RRci[1]))
|
|
|
|
else:
|
|
|
|
# rxc table
|
|
|
|
# Just use SciPy
|
|
|
|
|
|
|
|
result = stats.chi2_contingency(ct, correction=False)
|
|
|
|
|
|
|
|
return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
# -------------------
|
|
|
|
# Pearson correlation
|
|
|
|
|
|
|
|
class PearsonRResult:
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Result of Pearson correlation
|
|
|
|
|
|
|
|
See :func:`yli.pearsonr`.
|
|
|
|
"""
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def __init__(self, statistic, pvalue):
|
2022-10-17 21:41:19 +11:00
|
|
|
#: Pearson *r* correlation statistic (*float*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.statistic = statistic
|
2022-10-17 21:41:19 +11:00
|
|
|
#: *p* value for the *r* statistic (*float*)
|
2022-10-14 20:18:25 +11:00
|
|
|
self.pvalue = pvalue
|
|
|
|
|
2022-10-18 18:44:04 +11:00
|
|
|
def __repr__(self):
|
|
|
|
if config.repr_is_summary:
|
|
|
|
return self.summary()
|
|
|
|
return super().__repr__()
|
|
|
|
|
2022-10-14 20:18:25 +11:00
|
|
|
def _repr_html_(self):
|
2022-11-09 17:39:33 +11:00
|
|
|
return '<i>r</i> ({:g}% CI) = {}; <i>p</i> {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def summary(self):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
|
|
|
Return a stringified summary of the Pearson correlation
|
|
|
|
|
|
|
|
:rtype: str
|
|
|
|
"""
|
|
|
|
|
2022-11-09 17:39:33 +11:00
|
|
|
return 'r ({:g}% CI) = {}; p {}'.format((1-config.alpha)*100, self.statistic.summary(), fmt_p(self.pvalue, PValueStyle.RELATION))
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
def pearsonr(df, dep, ind, *, nan_policy='warn'):
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
2022-10-18 19:25:12 +11:00
|
|
|
Compute the Pearson correlation coefficient (Pearson's *r*)
|
2022-10-17 21:41:19 +11:00
|
|
|
|
|
|
|
:param df: Data to perform the test on
|
|
|
|
:type df: DataFrame
|
|
|
|
:param dep: Column in *df* for the dependent variable (numerical)
|
|
|
|
:type dep: str
|
|
|
|
:param ind: Column in *df* for the independent variable (numerical)
|
|
|
|
:type ind: str
|
|
|
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
|
|
|
:type nan_policy: str
|
|
|
|
|
|
|
|
:rtype: :class:`yli.sig_tests.PearsonRResult`
|
2022-10-20 20:57:57 +11:00
|
|
|
|
|
|
|
**Example:**
|
|
|
|
|
|
|
|
.. code-block::
|
|
|
|
|
|
|
|
df = pd.DataFrame({
|
|
|
|
'y': [41, 39, 47, 51, 43, 40, 57, 46, 50, 59, 61, 52],
|
|
|
|
'x': [24, 30, 33, 35, 36, 36, 37, 37, 38, 40, 43, 49]
|
|
|
|
})
|
|
|
|
yli.pearsonr(df, 'y', 'x')
|
|
|
|
|
|
|
|
.. code-block:: text
|
|
|
|
|
|
|
|
r (95% CI) = 0.65 (0.11–0.89); p = 0.02*
|
|
|
|
|
|
|
|
The output states that the value of the Pearson correlation coefficient is 0.65, with 95% confidence interval 0.11–0.89, and the test is significant with *p* value 0.02.
|
2022-10-17 21:41:19 +11:00
|
|
|
"""
|
2022-10-14 20:18:25 +11:00
|
|
|
|
|
|
|
# Check for/clean NaNs
|
|
|
|
df = check_nan(df[[ind, dep]], nan_policy)
|
|
|
|
|
|
|
|
# Compute Pearson's r
|
|
|
|
result = stats.pearsonr(df[ind], df[dep])
|
|
|
|
ci = result.confidence_interval()
|
|
|
|
|
|
|
|
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
|