From e8e97f60735ee722e6528babd28c60bc99f5a2dc Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Fri, 14 Oct 2022 20:18:25 +1100 Subject: [PATCH] Implement anova_oneway --- tests/test_anova.py | 52 ++++++++++++++++++++++++++++++++++++ yli/__init__.py | 2 +- yli/regress.py | 16 +----------- yli/sig_tests.py | 64 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 16 deletions(-) create mode 100644 tests/test_anova.py diff --git a/tests/test_anova.py b/tests/test_anova.py new file mode 100644 index 0000000..d533e0b --- /dev/null +++ b/tests/test_anova.py @@ -0,0 +1,52 @@ +# scipy-yli: Helpful SciPy utilities and recipes +# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from pytest import approx + +import pandas as pd +import statsmodels.api as sm + +import yli + +def test_anova_oneway_ol8_2(): + """Compare yli.anova_oneway for Ott & Longnecker (2016) example 8.2""" + + df = pd.DataFrame({ + 'Method': [1]*8 + [2]*7 + [3]*9, + 'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74] + }) + + result = yli.anova_oneway(df, 'Score', 'Method') + + assert result.statistic == approx(545.316/18.4366, rel=0.001) + assert result.dof1 == 2 + assert result.dof2 == 21 + assert result.pvalue < 0.001 + +def test_regress_ftest_ol8_2(): + """Compare ANOVA via yli.regress for Ott & Longnecker (2016) example 8.2""" + + df = pd.DataFrame({ + 'Method': [1]*8 + [2]*7 + [3]*9, + 'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74] + }) + + result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest() + + assert result.statistic == approx(545.316/18.4366, rel=0.001) + assert result.dof1 == 2 + assert result.dof2 == 21 + assert result.pvalue < 0.001 diff --git a/yli/__init__.py b/yli/__init__.py index e92de2e..ee0d36a 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -18,7 +18,7 @@ from .bayes_factors import bayesfactor_afbf from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted from .regress import PenalisedLogit, regress, vif -from .sig_tests import chi2, mannwhitney, ttest_ind +from .sig_tests import anova_oneway, chi2, mannwhitney, pearsonr, ttest_ind def reload_me(): import importlib diff --git a/yli/regress.py b/yli/regress.py index 103cf5c..6d6dea4 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -27,6 +27,7 @@ from datetime import datetime import itertools from .bayes_factors import BayesFactor, bayesfactor_afbf +from .sig_tests import FTestResult from .utils import Estimate, check_nan, fmt_p def vif(df, formula=None, nan_policy='warn'): @@ -91,21 +92,6 @@ class LikelihoodRatioTestResult: def summary(self): return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False)) -class FTestResult: - """Result of an F test for regression""" - - def __init__(self, statistic, dof_model, dof_resid, pvalue): - self.statistic = statistic - self.dof_model = dof_model - self.dof_resid = dof_resid - self.pvalue = pvalue - - def _repr_html_(self): - return 'F({}, {}) = {:.2f}; p {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=True)) - - def summary(self): - return 'F({}, {}) = {:.2f}; p {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=False)) - class RegressionResult: """ Result of a regression diff --git a/yli/sig_tests.py b/yli/sig_tests.py index dba9cd1..ed4992e 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -73,6 +73,42 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'): delta=abs(Estimate(delta, ci0, ci1)), delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2)) +# ------------- +# One-way ANOVA + +class FTestResult: + """Result of an F test for ANOVA/regression""" + + def __init__(self, statistic, dof1, dof2, pvalue): + self.statistic = statistic + self.dof1 = dof1 + self.dof2 = dof2 + self.pvalue = pvalue + + def _repr_html_(self): + return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True)) + + def summary(self): + return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False)) + +def anova_oneway(df, dep, ind, *, nan_policy='omit'): + """Perform one-way ANOVA""" + + # Check for/clean NaNs + df = check_nan(df[[ind, dep]], nan_policy) + + # Group by independent variable + groups = df.groupby(ind)[dep] + + # Perform one-way ANOVA + result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups]) + + # See stats.f_oneway implementation + dfbn = len(groups.groups) - 1 + dfwn = len(df) - len(groups.groups) + + return FTestResult(result.statistic, dfbn, dfwn, result.pvalue) + # ----------------- # Mann-Whitney test @@ -224,3 +260,31 @@ def chi2(df, dep, ind, *, nan_policy='warn'): result = stats.chi2_contingency(ct, correction=False) return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1]) + +# ------------------- +# Pearson correlation + +class PearsonRResult: + """Result of Pearson correlation""" + + def __init__(self, statistic, pvalue): + self.statistic = statistic + self.pvalue = pvalue + + def _repr_html_(self): + return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=True)) + + def summary(self): + return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=False)) + +def pearsonr(df, dep, ind, *, nan_policy='warn'): + """Compute the Pearson correlation coefficient (Pearson's r)""" + + # Check for/clean NaNs + df = check_nan(df[[ind, dep]], nan_policy) + + # Compute Pearson's r + result = stats.pearsonr(df[ind], df[dep]) + ci = result.confidence_interval() + + return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)