From b7a66849ff370a70e3cbd13267dc2f670dc27c4f Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Thu, 16 May 2024 23:11:43 +1000 Subject: [PATCH] Refactor and add test for correlation ratio (eta) in yli.auto_correlations --- tests/test_correlation.py | 15 ++++++++++++++- yli/descriptives.py | 35 ++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/tests/test_correlation.py b/tests/test_correlation.py index d9fe328..b2a9976 100644 --- a/tests/test_correlation.py +++ b/tests/test_correlation.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2024 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -16,6 +16,7 @@ from pytest import approx +import numpy as np import pandas as pd import yli @@ -66,3 +67,15 @@ def test_spearman_ol11_17(): expected_summary = 'ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*' # NB: The confidence intervals are unvalidated assert result.summary() == expected_summary assert result._repr_html_() == 'ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*' + +def test_eta_wikipedia(): + """Compare _compute_eta, used in yli.auto_correlations, for https://en.wikipedia.org/w/index.php?title=Correlation_ratio&oldid=1203268770#Example""" + + df = pd.DataFrame({ + 'Subject': ['Algebra'] * 5 + ['Geometry'] * 4 + ['Statistics'] * 6, + 'Score': [45, 70, 29, 15, 21, 40, 20, 30, 42, 65, 95, 80, 70, 85, 73] + }) + + result = yli.descriptives._compute_eta(df, 'Subject', 'Score') + + assert result == np.sqrt(6780/9640) diff --git a/yli/descriptives.py b/yli/descriptives.py index eabe30d..b4cdac2 100644 --- a/yli/descriptives.py +++ b/yli/descriptives.py @@ -238,32 +238,18 @@ def auto_correlations(df, cols): else: # Categorical-nominal, etc. # Compute eta - ssw = 0 - ssb = 0 - values_mean = df_2cols[col2].astype('float64').mean() - for category in df_2cols[col1].unique(): - subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64') - ssw += ((subgroup - subgroup.mean())**2).sum() - ssb += len(subgroup) * (subgroup.mean() - values_mean)**2 - statistic = (ssb / (ssb + ssw))**0.5 + statistic = _compute_eta(df_2cols, col1, col2) df_corr.loc[col1, col2] = statistic df_corr.loc[col2, col1] = statistic else: if col2 in categorical_columns and len(df_coded[col2].unique()) > 2: # Categorical-nominal, etc. # Compute eta - ssw = 0 - ssb = 0 - values_mean = df_2cols[col1].astype('float64').mean() - for category in df_2cols[col2].unique(): - subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64') - ssw += ((subgroup - subgroup.mean())**2).sum() - ssb += len(subgroup) * (subgroup.mean() - values_mean)**2 - statistic = (ssb / (ssb + ssw))**0.5 + statistic = _compute_eta(df_2cols, col2, col1) df_corr.loc[col1, col2] = statistic df_corr.loc[col2, col1] = statistic else: - # Nominal-nominal, etc. + # Continuous-continuous, etc. # Compute Pearson r (or Spearman rho, point-biserial, etc.) statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic df_corr.loc[col1, col2] = statistic @@ -274,6 +260,21 @@ def auto_correlations(df, cols): return AutoCorrelationsResult(df_corr) +def _compute_eta(df, col_category, col_numeric): + """ + Compute the correlation ratio, *η* + """ + + ssw = 0 + ssb = 0 + values_mean = df[col_numeric].astype('float64').mean() + for category in df[col_category].unique(): + subgroup = df[df[col_category] == category][col_numeric].astype('float64') + ssw += ((subgroup - subgroup.mean())**2).sum() + ssb += len(subgroup) * (subgroup.mean() - values_mean)**2 + statistic = (ssb / (ssb + ssw))**0.5 + return statistic + class AutoCorrelationsResult: """ Result of automatically computed pairwise correlation coefficients