Implement anova_oneway

This commit is contained in:
RunasSudo 2022-10-14 20:18:25 +11:00
parent a753761675
commit e8e97f6073
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
4 changed files with 118 additions and 16 deletions

52
tests/test_anova.py Normal file
View File

@ -0,0 +1,52 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from pytest import approx
import pandas as pd
import statsmodels.api as sm
import yli
def test_anova_oneway_ol8_2():
"""Compare yli.anova_oneway for Ott & Longnecker (2016) example 8.2"""
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
})
result = yli.anova_oneway(df, 'Score', 'Method')
assert result.statistic == approx(545.316/18.4366, rel=0.001)
assert result.dof1 == 2
assert result.dof2 == 21
assert result.pvalue < 0.001
def test_regress_ftest_ol8_2():
"""Compare ANOVA via yli.regress for Ott & Longnecker (2016) example 8.2"""
df = pd.DataFrame({
'Method': [1]*8 + [2]*7 + [3]*9,
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
})
result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
assert result.statistic == approx(545.316/18.4366, rel=0.001)
assert result.dof1 == 2
assert result.dof2 == 21
assert result.pvalue < 0.001

View File

@ -18,7 +18,7 @@ from .bayes_factors import bayesfactor_afbf
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import PenalisedLogit, regress, vif from .regress import PenalisedLogit, regress, vif
from .sig_tests import chi2, mannwhitney, ttest_ind from .sig_tests import anova_oneway, chi2, mannwhitney, pearsonr, ttest_ind
def reload_me(): def reload_me():
import importlib import importlib

View File

@ -27,6 +27,7 @@ from datetime import datetime
import itertools import itertools
from .bayes_factors import BayesFactor, bayesfactor_afbf from .bayes_factors import BayesFactor, bayesfactor_afbf
from .sig_tests import FTestResult
from .utils import Estimate, check_nan, fmt_p from .utils import Estimate, check_nan, fmt_p
def vif(df, formula=None, nan_policy='warn'): def vif(df, formula=None, nan_policy='warn'):
@ -91,21 +92,6 @@ class LikelihoodRatioTestResult:
def summary(self): def summary(self):
return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False)) return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False))
class FTestResult:
"""Result of an F test for regression"""
def __init__(self, statistic, dof_model, dof_resid, pvalue):
self.statistic = statistic
self.dof_model = dof_model
self.dof_resid = dof_resid
self.pvalue = pvalue
def _repr_html_(self):
return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=True))
def summary(self):
return 'F({}, {}) = {:.2f}; p {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=False))
class RegressionResult: class RegressionResult:
""" """
Result of a regression Result of a regression

View File

@ -73,6 +73,42 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
delta=abs(Estimate(delta, ci0, ci1)), delta=abs(Estimate(delta, ci0, ci1)),
delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2)) delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))
# -------------
# One-way ANOVA
class FTestResult:
"""Result of an F test for ANOVA/regression"""
def __init__(self, statistic, dof1, dof2, pvalue):
self.statistic = statistic
self.dof1 = dof1
self.dof2 = dof2
self.pvalue = pvalue
def _repr_html_(self):
return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
def summary(self):
return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
def anova_oneway(df, dep, ind, *, nan_policy='omit'):
"""Perform one-way ANOVA"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Group by independent variable
groups = df.groupby(ind)[dep]
# Perform one-way ANOVA
result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
# See stats.f_oneway implementation
dfbn = len(groups.groups) - 1
dfwn = len(df) - len(groups.groups)
return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)
# ----------------- # -----------------
# Mann-Whitney test # Mann-Whitney test
@ -224,3 +260,31 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
result = stats.chi2_contingency(ct, correction=False) result = stats.chi2_contingency(ct, correction=False)
return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
# -------------------
# Pearson correlation
class PearsonRResult:
"""Result of Pearson correlation"""
def __init__(self, statistic, pvalue):
self.statistic = statistic
self.pvalue = pvalue
def _repr_html_(self):
return '<i>r</i> (95% CI) = {}; <i>p</i> {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=True))
def summary(self):
return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=False))
def pearsonr(df, dep, ind, *, nan_policy='warn'):
"""Compute the Pearson correlation coefficient (Pearson's r)"""
# Check for/clean NaNs
df = check_nan(df[[ind, dep]], nan_policy)
# Compute Pearson's r
result = stats.pearsonr(df[ind], df[dep])
ci = result.confidence_interval()
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)