Use eta and Cramer's V in yli.auto_correlations for categorical variables

This commit is contained in:
RunasSudo 2024-05-16 22:36:09 +10:00
parent 553448e59a
commit 90cc780086
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A

View File

@@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes # scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo) # Copyright © 2022–2024 Lee Yingtong Li (RunasSudo)
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by # it under the terms of the GNU Affero General Public License as published by
@@ -146,8 +146,7 @@ def auto_correlations(df, cols):
""" """
Automatically compute pairwise correlation coefficients Automatically compute pairwise correlation coefficients
Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order. Categorical variables are factorised and coded according to the natural sort order of the categories.
Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
Ordinal variables are factorised and coded as ranks. Ordinal variables are factorised and coded as ranks.
Pairwise Pearson correlation coefficients are then calculated on the coded data. Pairwise Pearson correlation coefficients are then calculated on the coded data.
@@ -159,6 +158,10 @@ def auto_correlations(df, cols):
* A continuous variable and dichotomous variable are compared using point-biserial correlation * A continuous variable and dichotomous variable are compared using point-biserial correlation
* An ordinal variable and dichotomous variable are compared using rank-biserial correlation * An ordinal variable and dichotomous variable are compared using rank-biserial correlation
The exception is involving a categorical variable with more than 2 categories.
Correlation of 2 such variables is by Cramér's *V*.
Correlation of 1 such variable with another type of variable is by the correlation ratio *η* (on ranks, if applicable).
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported. There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to compute correlations for :param df: Data to compute correlations for
@@ -169,27 +172,11 @@ def auto_correlations(df, cols):
:rtype: :class:`yli.descriptives.AutoCorrelationsResult` :rtype: :class:`yli.descriptives.AutoCorrelationsResult`
""" """
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
return col
else:
# FIXME: Bools, binary, etc.
return col
# Code columns as numeric/ranks/etc. as appropriate # Code columns as numeric/ranks/etc. as appropriate
df_coded = pd.DataFrame(index=df.index) df_coded = pd.DataFrame(index=df.index)
categorical_columns = []
for col_name in cols: for col_name in cols:
col = df[col_name] col = df[col_name]
@@ -207,19 +194,29 @@ def auto_correlations(df, cols):
df_coded[col_name] = col df_coded[col_name] = col
elif col.dtype in ('bool', 'boolean', 'category', 'object'): elif col.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical variable
# Factorise
cat_values = col.dropna().unique() cat_values = col.dropna().unique()
cat_values = sorted(cat_values)
col = col.replace({v: i for i, v in enumerate(cat_values)})
df_coded[col_name] = col
if len(cat_values) == 2: categorical_columns.append(col_name)
#if len(cat_values) == 2:
# Categorical variable with 2 categories # Categorical variable with 2 categories
# Code as 0/1/NA # Code as 0/1/NA
cat_values = sorted(cat_values) #cat_values = sorted(cat_values)
col = col.replace({cat_values[0]: 0, cat_values[1]: 1}) #col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
df_coded[col_name] = col #df_coded[col_name] = col
else: #else:
# Categorical variable with >2 categories # Categorical variable with >2 categories
# Factorise
# Create dummy variables # Create dummy variables
dummies = pd.get_dummies(col, prefix=col_name) #dummies = pd.get_dummies(col, prefix=col_name)
df_coded = df_coded.join(dummies) #df_coded = df_coded.join(dummies)
else: else:
# Numeric variable, etc. # Numeric variable, etc.
df_coded[col_name] = col df_coded[col_name] = col
@@ -230,6 +227,44 @@ def auto_correlations(df, cols):
for i, col1 in enumerate(df_coded.columns): for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]: for col2 in df_coded.columns[:i]:
df_2cols = df_coded[[col1, col2]].dropna() df_2cols = df_coded[[col1, col2]].dropna()
if col1 in categorical_columns and len(df_coded[col1].unique()) > 2:
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
# Categorical-categorical
# Compute Cramer's V
statistic = stats.contingency.association(df_2cols, method='cramer')
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
# Categorical-nominal, etc.
# Compute eta
ssw = 0
ssb = 0
values_mean = df_2cols[col2].astype('float64').mean()
for category in df_2cols[col1].unique():
subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
ssw += ((subgroup - subgroup.mean())**2).sum()
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
statistic = (ssb / (ssb + ssw))**0.5
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
# Categorical-nominal, etc.
# Compute eta
ssw = 0
ssb = 0
values_mean = df_2cols[col1].astype('float64').mean()
for category in df_2cols[col2].unique():
subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
ssw += ((subgroup - subgroup.mean())**2).sum()
ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
statistic = (ssb / (ssb + ssw))**0.5
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
else:
# Nominal-nominal, etc.
# Compute Pearson r (or Spearman rho, point-biserial, etc.)
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
df_corr.loc[col1, col2] = statistic df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic df_corr.loc[col2, col1] = statistic