Refactor and add test for correlation ratio (eta) in yli.auto_correlations
This commit is contained in:
parent
7d080f7d20
commit
b7a66849ff
@ -1,5 +1,5 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
# Copyright © 2022–2024 Lee Yingtong Li (RunasSudo)
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
from pytest import approx
|
from pytest import approx
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
import yli
|
import yli
|
||||||
@ -66,3 +67,15 @@ def test_spearman_ol11_17():
|
|||||||
expected_summary = 'ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*' # NB: The confidence intervals are unvalidated
|
expected_summary = 'ρ (95% CI) = 0.87 (0.60–0.96); p < 0.001*' # NB: The confidence intervals are unvalidated
|
||||||
assert result.summary() == expected_summary
|
assert result.summary() == expected_summary
|
||||||
assert result._repr_html_() == '<i>ρ</i> (95% CI) = 0.87 (0.60–0.96); <i>p</i> < 0.001*'
|
assert result._repr_html_() == '<i>ρ</i> (95% CI) = 0.87 (0.60–0.96); <i>p</i> < 0.001*'
|
||||||
|
|
||||||
|
def test_eta_wikipedia():
    """Compare _compute_eta, used in yli.auto_correlations, for https://en.wikipedia.org/w/index.php?title=Correlation_ratio&oldid=1203268770#Example"""

    # Worked example: test scores in three subjects with unequal group sizes
    df = pd.DataFrame({
        'Subject': ['Algebra'] * 5 + ['Geometry'] * 4 + ['Statistics'] * 6,
        'Score': [45, 70, 29, 15, 21, 40, 20, 30, 42, 65, 95, 80, 70, 85, 73]
    })

    result = yli.descriptives._compute_eta(df, 'Subject', 'Score')

    # Between-group sum of squares = 6780, total sum of squares = 9640, so η² = 6780/9640.
    # Compare with a tolerance rather than ==: the implementation and the
    # expected value take the square root by different routes, so the two
    # floats are not guaranteed to be bit-identical on every platform.
    assert result == approx(np.sqrt(6780/9640))
|
||||||
|
@ -238,32 +238,18 @@ def auto_correlations(df, cols):
|
|||||||
else:
|
else:
|
||||||
# Categorical-nominal, etc.
|
# Categorical-nominal, etc.
|
||||||
# Compute eta
|
# Compute eta
|
||||||
ssw = 0
|
statistic = _compute_eta(df_2cols, col1, col2)
|
||||||
ssb = 0
|
|
||||||
values_mean = df_2cols[col2].astype('float64').mean()
|
|
||||||
for category in df_2cols[col1].unique():
|
|
||||||
subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
|
|
||||||
ssw += ((subgroup - subgroup.mean())**2).sum()
|
|
||||||
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
|
|
||||||
statistic = (ssb / (ssb + ssw))**0.5
|
|
||||||
df_corr.loc[col1, col2] = statistic
|
df_corr.loc[col1, col2] = statistic
|
||||||
df_corr.loc[col2, col1] = statistic
|
df_corr.loc[col2, col1] = statistic
|
||||||
else:
|
else:
|
||||||
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
|
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
|
||||||
# Categorical-nominal, etc.
|
# Categorical-nominal, etc.
|
||||||
# Compute eta
|
# Compute eta
|
||||||
ssw = 0
|
statistic = _compute_eta(df_2cols, col2, col1)
|
||||||
ssb = 0
|
|
||||||
values_mean = df_2cols[col1].astype('float64').mean()
|
|
||||||
for category in df_2cols[col2].unique():
|
|
||||||
subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
|
|
||||||
ssw += ((subgroup - subgroup.mean())**2).sum()
|
|
||||||
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
|
|
||||||
statistic = (ssb / (ssb + ssw))**0.5
|
|
||||||
df_corr.loc[col1, col2] = statistic
|
df_corr.loc[col1, col2] = statistic
|
||||||
df_corr.loc[col2, col1] = statistic
|
df_corr.loc[col2, col1] = statistic
|
||||||
else:
|
else:
|
||||||
# Nominal-nominal, etc.
|
# Continuous-continuous, etc.
|
||||||
# Compute Pearson r (or Spearman rho, point-biserial, etc.)
|
# Compute Pearson r (or Spearman rho, point-biserial, etc.)
|
||||||
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
|
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
|
||||||
df_corr.loc[col1, col2] = statistic
|
df_corr.loc[col1, col2] = statistic
|
||||||
@ -274,6 +260,21 @@ def auto_correlations(df, cols):
|
|||||||
|
|
||||||
return AutoCorrelationsResult(df_corr)
|
return AutoCorrelationsResult(df_corr)
|
||||||
|
|
||||||
|
def _compute_eta(df, col_category, col_numeric):
    """
    Compute the correlation ratio, *η*

    *η* is the square root of the between-category sum of squares divided by
    the total (between-category plus within-category) sum of squares.

    Rows where either the category or the numeric value is missing are
    ignored, so that group sizes, group means and the overall mean are all
    computed from the same observations.

    :param df: Data frame containing both columns
    :param col_category: Name of the column holding the category labels
    :param col_numeric: Name of the column holding the values (must be castable to float64)
    :return: The correlation ratio, or NaN if it is undefined (no complete rows, or the values have zero total variance)
    """

    categories = df[col_category]
    values = df[col_numeric].astype('float64')

    # A missing category never compares equal to itself, and a missing value
    # is skipped by mean() but still counted by len(), so drop both up front
    is_complete = categories.notna() & values.notna()
    categories = categories[is_complete]
    values = values[is_complete]

    values_mean = values.mean()

    ssw = 0  # Within-category sum of squares
    ssb = 0  # Between-category sum of squares

    # sort=False keeps categories in order of first appearance
    # observed=True skips unused levels of a categorical column, which would otherwise yield empty subgroups
    for _, subgroup in values.groupby(categories, sort=False, observed=True):
        subgroup_mean = subgroup.mean()
        ssw += ((subgroup - subgroup_mean)**2).sum()
        ssb += len(subgroup) * (subgroup_mean - values_mean)**2

    sst = ssb + ssw
    if sst == 0:
        # No data, or no variation at all: the ratio 0/0 is undefined
        return float('nan')

    return (ssb / sst)**0.5
|
||||||
|
|
||||||
class AutoCorrelationsResult:
|
class AutoCorrelationsResult:
|
||||||
"""
|
"""
|
||||||
Result of automatically computed pairwise correlation coefficients
|
Result of automatically computed pairwise correlation coefficients
|
||||||
|
Loading…
Reference in New Issue
Block a user