Refactor and add test for correlation ratio (eta) in yli.auto_correlations

2024-05-16 23:11:43 +10:00 · 2024-05-16 23:11:43 +10:00 · b7a66849ff
commit b7a66849ff
parent 7d080f7d20
2 changed files with 32 additions and 18 deletions
--- a/tests/test_correlation.py
+++ b/tests/test_correlation.py
@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2024  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@ -16,6 +16,7 @@

 from pytest import approx

+import numpy as np
 import pandas as pd

 import yli
@ -66,3 +67,15 @@ def test_spearman_ol11_17():
 	expected_summary = 'ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*'  # NB: The confidence intervals are unvalidated
 	assert result.summary() == expected_summary
 	assert result._repr_html_() == '<i>ρ</i> (95% CI) = 0.87 (0.60–0.96); <i>p</i> &lt; 0.001*'
+
+def test_eta_wikipedia():
+	"""Compare _compute_eta, used in yli.auto_correlations, with the worked example at https://en.wikipedia.org/w/index.php?title=Correlation_ratio&oldid=1203268770#Example"""
+	
+	df = pd.DataFrame({
+		'Subject': ['Algebra'] * 5 + ['Geometry'] * 4 + ['Statistics'] * 6,
+		'Score': [45, 70, 29, 15, 21, 40, 20, 30, 42, 65, 95, 80, 70, 85, 73]
+	})
+	
+	result = yli.descriptives._compute_eta(df, 'Subject', 'Score')
+	
+	# The result is a float built from sums, a division and a square root, so compare with a tolerance rather than exactly
+	assert result == approx(np.sqrt(6780/9640))
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -238,32 +238,18 @@ def auto_correlations(df, cols):
 				else:
 					# Categorical-nominal, etc.
 					# Compute eta
-					ssw = 0
-					ssb = 0
-					values_mean = df_2cols[col2].astype('float64').mean()
-					for category in df_2cols[col1].unique():
-						subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
-						ssw += ((subgroup - subgroup.mean())**2).sum()
-						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
-					statistic = (ssb / (ssb + ssw))**0.5
+					statistic = _compute_eta(df_2cols, col1, col2)
 					df_corr.loc[col1, col2] = statistic
 					df_corr.loc[col2, col1] = statistic
 			else:
 				if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
 					# Categorical-nominal, etc.
 					# Compute eta
-					ssw = 0
-					ssb = 0
-					values_mean = df_2cols[col1].astype('float64').mean()
-					for category in df_2cols[col2].unique():
-						subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
-						ssw += ((subgroup - subgroup.mean())**2).sum()
-						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
-					statistic = (ssb / (ssb + ssw))**0.5
+					statistic = _compute_eta(df_2cols, col2, col1)
 					df_corr.loc[col1, col2] = statistic
 					df_corr.loc[col2, col1] = statistic
 				else:
-					# Nominal-nominal, etc.
+					# Continuous-continuous, etc.
 					# Compute Pearson r (or Spearman rho, point-biserial, etc.)
 					statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
 					df_corr.loc[col1, col2] = statistic
@ -274,6 +260,21 @@ def auto_correlations(df, cols):
 	
 	return AutoCorrelationsResult(df_corr)

+def _compute_eta(df, col_category, col_numeric):
+	"""
+	Compute the correlation ratio, *η*, between a categorical and a numeric column
+	
+	*η* = sqrt(SSB / (SSB + SSW)), where SSB is the between-category sum of squares and SSW is the within-category sum of squares.
+	Rows with a missing category or a missing numeric value are excluded.
+	
+	:param df: Data containing both columns
+	:type df: DataFrame
+	:param col_category: Name of the column holding the categories
+	:type col_category: str
+	:param col_numeric: Name of the column holding the numeric values (must be convertible to float64)
+	:type col_numeric: str
+	
+	:return: The correlation ratio, or NaN if there are no complete rows or the numeric values do not vary
+	:rtype: float
+	"""
+	
+	# A row missing either field cannot be assigned to a group or contribute a deviation
+	df = df[[col_category, col_numeric]].dropna()
+	values = df[col_numeric].astype('float64')
+	
+	if values.empty:
+		return float('nan')
+	
+	# observed=True skips unused levels of a Categorical column, whose empty groups would have NaN means
+	group_means = values.groupby(df[col_category], observed=True).transform('mean')
+	
+	# Summing per row weights each category mean by its group size
+	ssw = ((values - group_means)**2).sum()
+	ssb = ((group_means - values.mean())**2).sum()
+	
+	if ssb + ssw == 0:
+		# No variation at all, so the ratio is undefined
+		return float('nan')
+	
+	return (ssb / (ssb + ssw))**0.5
+
 class AutoCorrelationsResult:
 	"""
 	Result of automatically computed pairwise correlation coefficients