From e8e97f60735ee722e6528babd28c60bc99f5a2dc Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Fri, 14 Oct 2022 20:18:25 +1100
Subject: [PATCH] Implement anova_oneway

---
 tests/test_anova.py | 52 ++++++++++++++++++++++++++++++++++++
 yli/__init__.py     |  2 +-
 yli/regress.py      | 16 +-----------
 yli/sig_tests.py    | 64 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 16 deletions(-)
 create mode 100644 tests/test_anova.py
diff --git a/tests/test_anova.py b/tests/test_anova.py
new file mode 100644
index 0000000..d533e0b
--- /dev/null
+++ b/tests/test_anova.py
@@ -0,0 +1,52 @@
+#   scipy-yli: Helpful SciPy utilities and recipes
+#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from pytest import approx
+
+import pandas as pd
+import statsmodels.api as sm
+
+import yli
+
+def test_anova_oneway_ol8_2():
+	"""Compare yli.anova_oneway for Ott & Longnecker (2016) example 8.2"""
+	
+	df = pd.DataFrame({
+		'Method': [1]*8 + [2]*7 + [3]*9,
+		'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
+	})
+	
+	result = yli.anova_oneway(df, 'Score', 'Method')
+	
+	assert result.statistic == approx(545.316/18.4366, rel=0.001)
+	assert result.dof1 == 2
+	assert result.dof2 == 21
+	assert result.pvalue < 0.001
+
+def test_regress_ftest_ol8_2():
+	"""Compare ANOVA via yli.regress for Ott & Longnecker (2016) example 8.2"""
+	
+	df = pd.DataFrame({
+		'Method': [1]*8 + [2]*7 + [3]*9,
+		'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
+	})
+	
+	result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
+	
+	assert result.statistic == approx(545.316/18.4366, rel=0.001)
+	assert result.dof1 == 2
+	assert result.dof2 == 21
+	assert result.pvalue < 0.001
diff --git a/yli/__init__.py b/yli/__init__.py
index e92de2e..ee0d36a 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -18,7 +18,7 @@ from .bayes_factors import bayesfactor_afbf
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
 from .regress import PenalisedLogit, regress, vif
-from .sig_tests import chi2, mannwhitney, ttest_ind
+from .sig_tests import anova_oneway, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
 	import importlib
diff --git a/yli/regress.py b/yli/regress.py
index 103cf5c..6d6dea4 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -27,6 +27,7 @@ from datetime import datetime
 import itertools
 
 from .bayes_factors import BayesFactor, bayesfactor_afbf
+from .sig_tests import FTestResult
 from .utils import Estimate, check_nan, fmt_p
 
 def vif(df, formula=None, nan_policy='warn'):
@@ -91,21 +92,6 @@ class LikelihoodRatioTestResult:
 	def summary(self):
 		return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, html=False))
 
-class FTestResult:
-	"""Result of an F test for regression"""
-	
-	def __init__(self, statistic, dof_model, dof_resid, pvalue):
-		self.statistic = statistic
-		self.dof_model = dof_model
-		self.dof_resid = dof_resid
-		self.pvalue = pvalue
-	
-	def _repr_html_(self):
-		return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=True))
-	
-	def summary(self):
-		return 'F({}, {}) = {:.2f}; p {}'.format(self.dof_model, self.dof_resid, self.statistic, fmt_p(self.pvalue, html=False))
-
 class RegressionResult:
 	"""
 	Result of a regression
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index dba9cd1..ed4992e 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -73,6 +73,42 @@ def ttest_ind(df, dep, ind, *, nan_policy='warn'):
 		delta=abs(Estimate(delta, ci0, ci1)),
 		delta_direction=('{0} > {1}' if d1.mean > d2.mean else '{1} > {0}').format(group1, group2))
 
+# -------------
+# One-way ANOVA
+
+class FTestResult:
+	"""Result of an F test for ANOVA/regression"""
+	
+	def __init__(self, statistic, dof1, dof2, pvalue):
+		self.statistic = statistic
+		self.dof1 = dof1
+		self.dof2 = dof2
+		self.pvalue = pvalue
+	
+	def _repr_html_(self):
+		return '<i>F</i>({}, {}) = {:.2f}; <i>p</i> {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=True))
+	
+	def summary(self):
+		return 'F({}, {}) = {:.2f}; p {}'.format(self.dof1, self.dof2, self.statistic, fmt_p(self.pvalue, html=False))
+
+def anova_oneway(df, dep, ind, *, nan_policy='omit'):
+	"""Perform one-way ANOVA"""
+	
+	# Check for/clean NaNs
+	df = check_nan(df[[ind, dep]], nan_policy)
+	
+	# Group by independent variable
+	groups = df.groupby(ind)[dep]
+	
+	# Perform one-way ANOVA
+	result = stats.f_oneway(*[groups.get_group(k) for k in groups.groups])
+	
+	# See stats.f_oneway implementation
+	dfbn = len(groups.groups) - 1
+	dfwn = len(df) - len(groups.groups)
+	
+	return FTestResult(result.statistic, dfbn, dfwn, result.pvalue)
+
 # -----------------
 # Mann-Whitney test
 
@@ -224,3 +260,31 @@ def chi2(df, dep, ind, *, nan_policy='warn'):
 		result = stats.chi2_contingency(ct, correction=False)
 		
 		return PearsonChiSquaredResult(ct=ct, statistic=result[0], dof=result[2], pvalue=result[1])
+
+# -------------------
+# Pearson correlation
+
+class PearsonRResult:
+	"""Result of Pearson correlation"""
+	
+	def __init__(self, statistic, pvalue):
+		self.statistic = statistic
+		self.pvalue = pvalue
+	
+	def _repr_html_(self):
+		return '<i>r</i> (95% CI) = {}; <i>p</i> {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=True))
+	
+	def summary(self):
+		return 'r (95% CI) = {}; p {}'.format(self.statistic.summary(), fmt_p(self.pvalue, html=False))
+
+def pearsonr(df, dep, ind, *, nan_policy='warn'):
+	"""Compute the Pearson correlation coefficient (Pearson's r)"""
+	
+	# Check for/clean NaNs
+	df = check_nan(df[[ind, dep]], nan_policy)
+	
+	# Compute Pearson's r
+	result = stats.pearsonr(df[ind], df[dep])
+	ci = result.confidence_interval()
+	
+	return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)