Implement ttest_ind
This commit is contained in:
parent
97a01b4e80
commit
6b43034a50
40
tests/test_ttest.py
Normal file
40
tests/test_ttest.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
|
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from pytest import approx
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import yli
|
||||||
|
|
||||||
|
def test_ttest_ind_ol6_1():
|
||||||
|
"""Compare yli.ttest_ind for Ott & Longnecker (2016) example 6.1"""
|
||||||
|
|
||||||
|
df = pd.DataFrame({
|
||||||
|
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
|
||||||
|
'Potency': [10.2, 10.5, 10.3, 10.8, 9.8, 10.6, 10.7, 10.2, 10.0, 10.6, 9.8, 9.6, 10.1, 10.2, 10.1, 9.7, 9.5, 9.6, 9.8, 9.9]
|
||||||
|
})
|
||||||
|
|
||||||
|
result = yli.ttest_ind(df, 'Potency', 'Type')
|
||||||
|
|
||||||
|
t_expected = 0.54/(0.285*np.sqrt(1/10+1/10))
|
||||||
|
|
||||||
|
assert result.statistic == approx(t_expected, abs=0.01)
|
||||||
|
assert result.dof == 18
|
||||||
|
assert result.delta.point == approx(0.54, abs=0.01)
|
||||||
|
assert result.delta.ci_lower == approx(0.272, abs=0.01)
|
||||||
|
assert result.delta.ci_upper == approx(0.808, abs=0.01)
|
105
yli/sig_tests.py
105
yli/sig_tests.py
@ -21,74 +21,10 @@ import statsmodels.api as sm
|
|||||||
import functools
|
import functools
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
def check_nan(df, nan_policy):
|
from .utils import Estimate, as_2groups, check_nan, fmt_p_html, fmt_p_text
|
||||||
"""Check df against nan_policy and return cleaned input"""
|
|
||||||
|
|
||||||
if nan_policy == 'raise':
|
# ----------------
|
||||||
if pd.isna(df).any(axis=None):
|
# Student's t test
|
||||||
raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
|
|
||||||
elif nan_policy == 'warn':
|
|
||||||
df_cleaned = df.dropna()
|
|
||||||
if len(df_cleaned) < len(df):
|
|
||||||
warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
|
|
||||||
return df_cleaned
|
|
||||||
elif nan_policy == 'omit':
|
|
||||||
return df.dropna()
|
|
||||||
else:
|
|
||||||
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
|
|
||||||
|
|
||||||
def do_fmt_p(p):
|
|
||||||
"""Return sign and formatted p value"""
|
|
||||||
|
|
||||||
if p < 0.001:
|
|
||||||
return '<', '0.001*'
|
|
||||||
elif p < 0.0095:
|
|
||||||
return None, '{:.3f}*'.format(p)
|
|
||||||
elif p < 0.045:
|
|
||||||
return None, '{:.2f}*'.format(p)
|
|
||||||
elif p < 0.05:
|
|
||||||
return None, '{:.3f}*'.format(p) # 3dps to show significance
|
|
||||||
elif p < 0.055:
|
|
||||||
return None, '{:.3f}'.format(p) # 3dps to show non-significance
|
|
||||||
elif p < 0.095:
|
|
||||||
return None, '{:.2f}'.format(p)
|
|
||||||
else:
|
|
||||||
return None, '{:.1f}'.format(p)
|
|
||||||
|
|
||||||
def fmt_p_text(p, nospace=False):
|
|
||||||
"""Format p value for plaintext"""
|
|
||||||
|
|
||||||
sign, fmt = do_fmt_p(p)
|
|
||||||
if sign is not None:
|
|
||||||
if nospace:
|
|
||||||
return sign + fmt # e.g. "<0.001"
|
|
||||||
else:
|
|
||||||
return sign + ' ' + fmt # e.g. "< 0.001"
|
|
||||||
else:
|
|
||||||
if nospace:
|
|
||||||
return fmt # e.g. "0.05"
|
|
||||||
else:
|
|
||||||
return '= ' + fmt # e.g. "= 0.05"
|
|
||||||
|
|
||||||
def fmt_p_html(p, nospace=False):
|
|
||||||
"""Format p value for HTML"""
|
|
||||||
|
|
||||||
txt = fmt_p_text(p, nospace)
|
|
||||||
return txt.replace('<', '<')
|
|
||||||
|
|
||||||
class Estimate:
|
|
||||||
"""A point estimate and surrounding confidence interval"""
|
|
||||||
|
|
||||||
def __init__(self, point, ci_lower, ci_upper):
|
|
||||||
self.point = point
|
|
||||||
self.ci_lower = ci_lower
|
|
||||||
self.ci_upper = ci_upper
|
|
||||||
|
|
||||||
def _repr_html_(self):
|
|
||||||
return self.summary()
|
|
||||||
|
|
||||||
def summary(self):
|
|
||||||
return '{:.2f} ({:.2f}–{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)
|
|
||||||
|
|
||||||
class TTestResult:
|
class TTestResult:
|
||||||
"""
|
"""
|
||||||
@ -97,44 +33,41 @@ class TTestResult:
|
|||||||
delta: Mean difference
|
delta: Mean difference
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, statistic, dof, pvalue, delta):
|
def __init__(self, statistic, dof, pvalue, delta, delta_direction):
|
||||||
self.statistic = statistic
|
self.statistic = statistic
|
||||||
self.dof = dof
|
self.dof = dof
|
||||||
self.pvalue = pvalue
|
self.pvalue = pvalue
|
||||||
self.delta = delta
|
self.delta = delta
|
||||||
|
self.delta_direction = delta_direction
|
||||||
|
|
||||||
def _repr_html_(self):
|
def _repr_html_(self):
|
||||||
return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary())
|
return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary(), self.delta_direction)
|
||||||
|
|
||||||
def summary(self):
|
def summary(self):
|
||||||
return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary())
|
return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary(), self.delta_direction)
|
||||||
|
|
||||||
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
||||||
"""Perform an independent-sample Student's t test"""
|
"""Perform an independent-sample Student's t test"""
|
||||||
|
|
||||||
|
# Check for/clean NaNs
|
||||||
df = check_nan(df[[ind, dep]], nan_policy)
|
df = check_nan(df[[ind, dep]], nan_policy)
|
||||||
|
|
||||||
# Get groupings for ind
|
# Ensure 2 groups for ind
|
||||||
groups = list(df.groupby(ind).groups.values())
|
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
||||||
|
|
||||||
# Ensure only 2 groups to compare
|
|
||||||
if len(groups) != 2:
|
|
||||||
raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind))
|
|
||||||
|
|
||||||
# Get 2 groups
|
|
||||||
group1 = df.loc[groups[0], dep]
|
|
||||||
group2 = df.loc[groups[1], dep]
|
|
||||||
|
|
||||||
# Do t test
|
# Do t test
|
||||||
# Use statsmodels rather than SciPy because this provides the mean difference automatically
|
# Use statsmodels rather than SciPy because this provides the mean difference automatically
|
||||||
d1 = sm.stats.DescrStatsW(group1)
|
d1 = sm.stats.DescrStatsW(data1)
|
||||||
d2 = sm.stats.DescrStatsW(group2)
|
d2 = sm.stats.DescrStatsW(data2)
|
||||||
|
|
||||||
cm = sm.stats.CompareMeans(d2, d1) # This order to get correct CI
|
cm = sm.stats.CompareMeans(d1, d2)
|
||||||
statistic, pvalue, dof = cm.ttest_ind()
|
statistic, pvalue, dof = cm.ttest_ind()
|
||||||
|
|
||||||
delta = d2.mean - d1.mean
|
delta = d1.mean - d2.mean
|
||||||
ci0, ci1 = cm.tconfint_diff()
|
ci0, ci1 = cm.tconfint_diff()
|
||||||
|
|
||||||
return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1))
|
# t test is symmetric so take absolute values
|
||||||
0
|
return TTestResult(
|
||||||
|
statistic=abs(statistic), dof=dof, pvalue=pvalue,
|
||||||
|
delta=abs(Estimate(delta, ci0, ci1)),
|
||||||
|
delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))
|
||||||
|
119
yli/utils.py
Normal file
119
yli/utils.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
|
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
def check_nan(df, nan_policy):
|
||||||
|
"""Check df against nan_policy and return cleaned input"""
|
||||||
|
|
||||||
|
if nan_policy == 'raise':
|
||||||
|
if pd.isna(df).any(axis=None):
|
||||||
|
raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
|
||||||
|
elif nan_policy == 'warn':
|
||||||
|
df_cleaned = df.dropna()
|
||||||
|
if len(df_cleaned) < len(df):
|
||||||
|
warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
|
||||||
|
return df_cleaned
|
||||||
|
elif nan_policy == 'omit':
|
||||||
|
return df.dropna()
|
||||||
|
else:
|
||||||
|
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
|
||||||
|
|
||||||
|
def as_2groups(df, data, group):
|
||||||
|
"""Group the data by the given variable, ensuring only 2 groups"""
|
||||||
|
|
||||||
|
# Get groupings
|
||||||
|
groups = list(df.groupby(group).groups.items())
|
||||||
|
|
||||||
|
# Ensure only 2 groups to compare
|
||||||
|
if len(groups) != 2:
|
||||||
|
raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
|
||||||
|
|
||||||
|
# Get 2 groups
|
||||||
|
group1 = groups[0][0]
|
||||||
|
data1 = df.loc[groups[0][1], data]
|
||||||
|
group2 = groups[1][0]
|
||||||
|
data2 = df.loc[groups[1][1], data]
|
||||||
|
|
||||||
|
return group1, data1, group2, data2
|
||||||
|
|
||||||
|
def do_fmt_p(p):
|
||||||
|
"""Return sign and formatted p value"""
|
||||||
|
|
||||||
|
if p < 0.001:
|
||||||
|
return '<', '0.001*'
|
||||||
|
elif p < 0.0095:
|
||||||
|
return None, '{:.3f}*'.format(p)
|
||||||
|
elif p < 0.045:
|
||||||
|
return None, '{:.2f}*'.format(p)
|
||||||
|
elif p < 0.05:
|
||||||
|
return None, '{:.3f}*'.format(p) # 3dps to show significance
|
||||||
|
elif p < 0.055:
|
||||||
|
return None, '{:.3f}'.format(p) # 3dps to show non-significance
|
||||||
|
elif p < 0.095:
|
||||||
|
return None, '{:.2f}'.format(p)
|
||||||
|
else:
|
||||||
|
return None, '{:.1f}'.format(p)
|
||||||
|
|
||||||
|
def fmt_p_text(p, nospace=False):
|
||||||
|
"""Format p value for plaintext"""
|
||||||
|
|
||||||
|
sign, fmt = do_fmt_p(p)
|
||||||
|
if sign is not None:
|
||||||
|
if nospace:
|
||||||
|
return sign + fmt # e.g. "<0.001"
|
||||||
|
else:
|
||||||
|
return sign + ' ' + fmt # e.g. "< 0.001"
|
||||||
|
else:
|
||||||
|
if nospace:
|
||||||
|
return fmt # e.g. "0.05"
|
||||||
|
else:
|
||||||
|
return '= ' + fmt # e.g. "= 0.05"
|
||||||
|
|
||||||
|
def fmt_p_html(p, nospace=False):
|
||||||
|
"""Format p value for HTML"""
|
||||||
|
|
||||||
|
txt = fmt_p_text(p, nospace)
|
||||||
|
return txt.replace('<', '<')
|
||||||
|
|
||||||
|
class Estimate:
|
||||||
|
"""A point estimate and surrounding confidence interval"""
|
||||||
|
|
||||||
|
def __init__(self, point, ci_lower, ci_upper):
|
||||||
|
self.point = point
|
||||||
|
self.ci_lower = ci_lower
|
||||||
|
self.ci_upper = ci_upper
|
||||||
|
|
||||||
|
def _repr_html_(self):
|
||||||
|
return self.summary()
|
||||||
|
|
||||||
|
def summary(self):
|
||||||
|
return '{:.2f} ({:.2f}–{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)
|
||||||
|
|
||||||
|
def __neg__(self):
|
||||||
|
return Estimate(-self.point, -self.ci_upper, -self.ci_lower)
|
||||||
|
|
||||||
|
def __abs__(self):
|
||||||
|
if self.point < 0:
|
||||||
|
return -self
|
||||||
|
else:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def exp(self):
|
||||||
|
return Estimate(np.exp(self.point), np.exp(self.ci_lower), np.exp(self.ci_upper))
|
Loading…
Reference in New Issue
Block a user