Implement ttest_ind

This commit is contained in:
RunasSudo 2022-10-13 12:53:18 +11:00
parent 97a01b4e80
commit 6b43034a50
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 178 additions and 86 deletions

40
tests/test_ttest.py Normal file
View File

@ -0,0 +1,40 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from pytest import approx
import numpy as np
import pandas as pd
import yli
def test_ttest_ind_ol6_1():
"""Compare yli.ttest_ind for Ott & Longnecker (2016) example 6.1"""
df = pd.DataFrame({
'Type': ['Fresh'] * 10 + ['Stored'] * 10,
'Potency': [10.2, 10.5, 10.3, 10.8, 9.8, 10.6, 10.7, 10.2, 10.0, 10.6, 9.8, 9.6, 10.1, 10.2, 10.1, 9.7, 9.5, 9.6, 9.8, 9.9]
})
result = yli.ttest_ind(df, 'Potency', 'Type')
t_expected = 0.54/(0.285*np.sqrt(1/10+1/10))
assert result.statistic == approx(t_expected, abs=0.01)
assert result.dof == 18
assert result.delta.point == approx(0.54, abs=0.01)
assert result.delta.ci_lower == approx(0.272, abs=0.01)
assert result.delta.ci_upper == approx(0.808, abs=0.01)

View File

@ -21,74 +21,10 @@ import statsmodels.api as sm
import functools import functools
import warnings import warnings
def check_nan(df, nan_policy): from .utils import Estimate, as_2groups, check_nan, fmt_p_html, fmt_p_text
"""Check df against nan_policy and return cleaned input"""
if nan_policy == 'raise': # ----------------
if pd.isna(df).any(axis=None): # Student's t test
raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
elif nan_policy == 'warn':
df_cleaned = df.dropna()
if len(df_cleaned) < len(df):
warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
return df_cleaned
elif nan_policy == 'omit':
return df.dropna()
else:
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
def do_fmt_p(p):
"""Return sign and formatted p value"""
if p < 0.001:
return '<', '0.001*'
elif p < 0.0095:
return None, '{:.3f}*'.format(p)
elif p < 0.045:
return None, '{:.2f}*'.format(p)
elif p < 0.05:
return None, '{:.3f}*'.format(p) # 3dps to show significance
elif p < 0.055:
return None, '{:.3f}'.format(p) # 3dps to show non-significance
elif p < 0.095:
return None, '{:.2f}'.format(p)
else:
return None, '{:.1f}'.format(p)
def fmt_p_text(p, nospace=False):
"""Format p value for plaintext"""
sign, fmt = do_fmt_p(p)
if sign is not None:
if nospace:
return sign + fmt # e.g. "<0.001"
else:
return sign + ' ' + fmt # e.g. "< 0.001"
else:
if nospace:
return fmt # e.g. "0.05"
else:
return '= ' + fmt # e.g. "= 0.05"
def fmt_p_html(p, nospace=False):
"""Format p value for HTML"""
txt = fmt_p_text(p, nospace)
return txt.replace('<', '&lt;')
class Estimate:
"""A point estimate and surrounding confidence interval"""
def __init__(self, point, ci_lower, ci_upper):
self.point = point
self.ci_lower = ci_lower
self.ci_upper = ci_upper
def _repr_html_(self):
return self.summary()
def summary(self):
return '{:.2f} ({:.2f}{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)
class TTestResult: class TTestResult:
""" """
@ -97,44 +33,41 @@ class TTestResult:
delta: Mean difference delta: Mean difference
""" """
def __init__(self, statistic, dof, pvalue, delta): def __init__(self, statistic, dof, pvalue, delta, delta_direction):
self.statistic = statistic self.statistic = statistic
self.dof = dof self.dof = dof
self.pvalue = pvalue self.pvalue = pvalue
self.delta = delta self.delta = delta
self.delta_direction = delta_direction
def _repr_html_(self): def _repr_html_(self):
return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary()) return '<i>t</i>({:.0f}) = {:.2f}; <i>p</i> {}<br><i>δ</i> (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_html(self.pvalue), self.delta.summary(), self.delta_direction)
def summary(self): def summary(self):
return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary()) return 't({:.0f}) = {:.2f}; p {}\nδ (95% CI) = {}, {}'.format(self.dof, self.statistic, fmt_p_text(self.pvalue), self.delta.summary(), self.delta_direction)
def ttest_ind(df, dep, ind, *, nan_policy='warn'): def ttest_ind(df, dep, ind, *, nan_policy='warn'):
"""Perform an independent-sample Student's t test""" """Perform an independent-sample Student's t test"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy) df = check_nan(df[[ind, dep]], nan_policy)
# Get groupings for ind # Ensure 2 groups for ind
groups = list(df.groupby(ind).groups.values()) group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Ensure only 2 groups to compare
if len(groups) != 2:
raise Exception('Got {} values for {}, expected 2'.format(len(groups), ind))
# Get 2 groups
group1 = df.loc[groups[0], dep]
group2 = df.loc[groups[1], dep]
# Do t test # Do t test
# Use statsmodels rather than SciPy because this provides the mean difference automatically # Use statsmodels rather than SciPy because this provides the mean difference automatically
d1 = sm.stats.DescrStatsW(group1) d1 = sm.stats.DescrStatsW(data1)
d2 = sm.stats.DescrStatsW(group2) d2 = sm.stats.DescrStatsW(data2)
cm = sm.stats.CompareMeans(d2, d1) # This order to get correct CI cm = sm.stats.CompareMeans(d1, d2)
statistic, pvalue, dof = cm.ttest_ind() statistic, pvalue, dof = cm.ttest_ind()
delta = d2.mean - d1.mean delta = d1.mean - d2.mean
ci0, ci1 = cm.tconfint_diff() ci0, ci1 = cm.tconfint_diff()
return TTestResult(statistic=statistic, dof=dof, pvalue=pvalue, delta=Estimate(delta, ci0, ci1)) # t test is symmetric so take absolute values
0 return TTestResult(
statistic=abs(statistic), dof=dof, pvalue=pvalue,
delta=abs(Estimate(delta, ci0, ci1)),
delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))

119
yli/utils.py Normal file
View File

@ -0,0 +1,119 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
import pandas as pd
import warnings
def check_nan(df, nan_policy):
"""Check df against nan_policy and return cleaned input"""
if nan_policy == 'raise':
if pd.isna(df).any(axis=None):
raise ValueError('NaN in input, pass nan_policy="warn" or "omit" to ignore')
elif nan_policy == 'warn':
df_cleaned = df.dropna()
if len(df_cleaned) < len(df):
warnings.warn('Omitting {} rows with NaN'.format(len(df) - len(df_cleaned)))
return df_cleaned
elif nan_policy == 'omit':
return df.dropna()
else:
raise Exception('Invalid nan_policy, expected "raise", "warn" or "omit"')
def as_2groups(df, data, group):
"""Group the data by the given variable, ensuring only 2 groups"""
# Get groupings
groups = list(df.groupby(group).groups.items())
# Ensure only 2 groups to compare
if len(groups) != 2:
raise Exception('Got {} values for {}, expected 2'.format(len(groups), group))
# Get 2 groups
group1 = groups[0][0]
data1 = df.loc[groups[0][1], data]
group2 = groups[1][0]
data2 = df.loc[groups[1][1], data]
return group1, data1, group2, data2
def do_fmt_p(p):
"""Return sign and formatted p value"""
if p < 0.001:
return '<', '0.001*'
elif p < 0.0095:
return None, '{:.3f}*'.format(p)
elif p < 0.045:
return None, '{:.2f}*'.format(p)
elif p < 0.05:
return None, '{:.3f}*'.format(p) # 3dps to show significance
elif p < 0.055:
return None, '{:.3f}'.format(p) # 3dps to show non-significance
elif p < 0.095:
return None, '{:.2f}'.format(p)
else:
return None, '{:.1f}'.format(p)
def fmt_p_text(p, nospace=False):
"""Format p value for plaintext"""
sign, fmt = do_fmt_p(p)
if sign is not None:
if nospace:
return sign + fmt # e.g. "<0.001"
else:
return sign + ' ' + fmt # e.g. "< 0.001"
else:
if nospace:
return fmt # e.g. "0.05"
else:
return '= ' + fmt # e.g. "= 0.05"
def fmt_p_html(p, nospace=False):
"""Format p value for HTML"""
txt = fmt_p_text(p, nospace)
return txt.replace('<', '&lt;')
class Estimate:
"""A point estimate and surrounding confidence interval"""
def __init__(self, point, ci_lower, ci_upper):
self.point = point
self.ci_lower = ci_lower
self.ci_upper = ci_upper
def _repr_html_(self):
return self.summary()
def summary(self):
return '{:.2f} ({:.2f}{:.2f})'.format(self.point, self.ci_lower, self.ci_upper)
def __neg__(self):
return Estimate(-self.point, -self.ci_upper, -self.ci_lower)
def __abs__(self):
if self.point < 0:
return -self
else:
return self
def exp(self):
return Estimate(np.exp(self.point), np.exp(self.ci_lower), np.exp(self.ci_upper))