Use eta and Cramer's V in yli.auto_correlations for categorical variables

This commit is contained in:
RunasSudo 2024-05-16 22:36:09 +10:00
parent 553448e59a
commit 90cc780086
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2024 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -146,8 +146,7 @@ def auto_correlations(df, cols):
"""
Automatically compute pairwise correlation coefficients
Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
Categorical variables are factorised and coded according to the natural sort order of the categories.
Ordinal variables are factorised and coded as ranks.
Pairwise Pearson correlation coefficients are then calculated on the coded data.
@ -159,6 +158,10 @@ def auto_correlations(df, cols):
* A continuous variable and dichotomous variable are compared using point-biserial correlation
* An ordinal variable and dichotomous variable are compared using rank-biserial correlation
The exception is involving a categorical variable with more than 2 categories.
Correlation of 2 such variables is by Cramér's *V*.
Correlation of 1 such variable with another type of variable is by the correlation ratio *η* (on ranks, if applicable).
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to compute correlations for
@ -169,27 +172,11 @@ def auto_correlations(df, cols):
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
"""
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
return col
else:
# FIXME: Bools, binary, etc.
return col
# Code columns as numeric/ranks/etc. as appropriate
df_coded = pd.DataFrame(index=df.index)
categorical_columns = []
for col_name in cols:
col = df[col_name]
@ -207,19 +194,29 @@ def auto_correlations(df, cols):
df_coded[col_name] = col
elif col.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical variable
# Factorise
cat_values = col.dropna().unique()
cat_values = sorted(cat_values)
col = col.replace({v: i for i, v in enumerate(cat_values)})
df_coded[col_name] = col
if len(cat_values) == 2:
categorical_columns.append(col_name)
#if len(cat_values) == 2:
# Categorical variable with 2 categories
# Code as 0/1/NA
cat_values = sorted(cat_values)
col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
df_coded[col_name] = col
else:
#cat_values = sorted(cat_values)
#col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
#df_coded[col_name] = col
#else:
# Categorical variable with >2 categories
# Factorise
# Create dummy variables
dummies = pd.get_dummies(col, prefix=col_name)
df_coded = df_coded.join(dummies)
#dummies = pd.get_dummies(col, prefix=col_name)
#df_coded = df_coded.join(dummies)
else:
# Numeric variable, etc.
df_coded[col_name] = col
@ -230,6 +227,44 @@ def auto_correlations(df, cols):
for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]:
df_2cols = df_coded[[col1, col2]].dropna()
if col1 in categorical_columns and len(df_coded[col1].unique()) > 2:
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
# Categorical-categorical
# Compute Cramer's V
statistic = stats.contingency.association(df_2cols, method='cramer')
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
# Categorical-nominal, etc.
# Compute eta
ssw = 0
ssb = 0
values_mean = df_2cols[col2].astype('float64').mean()
for category in df_2cols[col1].unique():
subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
ssw += ((subgroup - subgroup.mean())**2).sum()
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
statistic = (ssb / (ssb + ssw))**0.5
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
# Categorical-nominal, etc.
# Compute eta
ssw = 0
ssb = 0
values_mean = df_2cols[col1].astype('float64').mean()
for category in df_2cols[col2].unique():
subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
ssw += ((subgroup - subgroup.mean())**2).sum()
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
statistic = (ssb / (ssb + ssw))**0.5
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
# Nominal-nominal, etc.
# Compute Pearson r (or Spearman rho, point-biserial, etc.)
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic