Use eta and Cramer's V in yli.auto_correlations for categorical variables
parent 553448e59a
commit 90cc780086
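For orientation, `yli.auto_correlations` takes a DataFrame and a list of column names and returns an `AutoCorrelationsResult` (see the docstring changes below). A minimal usage sketch, with invented column names and data, assuming the `yli` package is installed:

```python
import pandas as pd
import yli

# Invented example data: one continuous, one dichotomous and one
# multi-category column
df = pd.DataFrame({
	'Age': [25, 31, 47, 52, 38, 61],
	'Smoker': ['No', 'Yes', 'No', 'Yes', 'No', 'Yes'],
	'BloodGroup': ['A', 'B', 'O', 'AB', 'O', 'A'],
})

# Pairwise correlations; after this commit, pairs involving the
# multi-category 'BloodGroup' column use Cramér's V or eta
result = yli.auto_correlations(df, ['Age', 'Smoker', 'BloodGroup'])
print(result)
```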
@@ -1,5 +1,5 @@
 # scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+# Copyright © 2022–2024 Lee Yingtong Li (RunasSudo)
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published by
@@ -146,8 +146,7 @@ def auto_correlations(df, cols):
 	"""
 	Automatically compute pairwise correlation coefficients
 	
-	Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
-	Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
+	Categorical variables are factorised and coded according to the natural sort order of the categories.
 	Ordinal variables are factorised and coded as ranks.
 	Pairwise Pearson correlation coefficients are then calculated on the coded data.
 	
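To illustrate the coding scheme described in the new docstring (a sketch with invented data, not the library's internal code): categorical values are mapped to integers in natural sort order, ordinal values become ranks, and Pearson correlation is then taken on the coded columns.

```python
import pandas as pd
from scipy import stats

# Invented data: a dichotomous column and an ordinal column
smoker = pd.Series(['Yes', 'No', 'No', 'Yes', 'Yes'])
severity = pd.Series(pd.Categorical(
	['Severe', 'Mild', 'Mild', 'Moderate', 'Severe'],
	categories=['Mild', 'Moderate', 'Severe'], ordered=True))

# Dichotomous: coded 0/1 by natural sort order ('No' < 'Yes')
smoker_coded = smoker.map({'No': 0, 'Yes': 1})

# Ordinal: factorised, then coded as ranks (ties get average ranks)
severity_coded = pd.Series(stats.rankdata(severity.cat.codes))

# Pearson correlation on the coded data (interpretable as a
# rank-biserial correlation per the docstring)
print(stats.pearsonr(smoker_coded, severity_coded).statistic)
```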
@@ -159,6 +158,10 @@ def auto_correlations(df, cols):
 	* A continuous variable and dichotomous variable are compared using point-biserial correlation
 	* An ordinal variable and dichotomous variable are compared using rank-biserial correlation
 	
+	The exception is involving a categorical variable with more than 2 categories.
+	Correlation of 2 such variables is by Cramér's *V*.
+	Correlation of 1 such variable with another type of variable is by the correlation ratio *η* (on ranks, if applicable).
+	
 	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
 	
 	:param df: Data to compute correlations for
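For reference, the Cramér's *V* mentioned in the added docstring lines is defined on a contingency table; with SciPy it can be computed via `scipy.stats.contingency.association`. A small sketch with invented data, building the table explicitly with `pd.crosstab`:

```python
import pandas as pd
from scipy import stats

# Invented example: two multi-category variables
a = pd.Series(['A', 'A', 'B', 'B', 'C', 'C', 'A', 'C'])
b = pd.Series(['X', 'Y', 'X', 'Y', 'Z', 'Z', 'X', 'Z'])

# Cramér's V is computed from the table of observed counts
table = pd.crosstab(a, b)
v = stats.contingency.association(table, method='cramer')
print(round(v, 3))
```

The correlation ratio *η* used for mixed pairs is computed from between-group and within-group sums of squares; a standalone sketch of that calculation follows the final hunk below.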
@@ -169,27 +172,11 @@ def auto_correlations(df, cols):
 	:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
 	"""
 	
-	def _col_to_numeric(col):
-		if col.dtype == 'category' and col.cat.ordered:
-			# Ordinal variable
-			# Factorise if required
-			col, _ = as_numeric(col)
-			
-			# Code as ranks
-			col[col >= 0] = stats.rankdata(col[col >= 0])
-			
-			# Put NaNs back
-			col = col.astype('float64')
-			col[col < 0] = pd.NA
-			
-			return col
-		else:
-			# FIXME: Bools, binary, etc.
-			return col
-	
 	# Code columns as numeric/ranks/etc. as appropriate
 	df_coded = pd.DataFrame(index=df.index)
 	
+	categorical_columns = []
+	
 	for col_name in cols:
 		col = df[col_name]
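The removed `_col_to_numeric` helper relied on pandas coding missing values in a factorised ordered categorical as −1, replaced the non-negative codes with their ranks, and then restored the missing values. A standalone sketch of the same idea (illustrative only, with invented data; `float('nan')` is used here in place of `pd.NA`):

```python
import pandas as pd
from scipy import stats

# Ordered categorical with a missing value; pandas codes missing as -1
col = pd.Series(pd.Categorical(
	['Low', 'High', None, 'Medium', 'High'],
	categories=['Low', 'Medium', 'High'], ordered=True))
codes = col.cat.codes.astype('float64')  # [0.0, 2.0, -1.0, 1.0, 2.0]

# Replace the valid codes with their ranks, then put the NaNs back
codes[codes >= 0] = stats.rankdata(codes[codes >= 0])
codes[codes < 0] = float('nan')
print(codes.tolist())  # [1.0, 3.5, nan, 2.0, 3.5]
```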
@@ -207,19 +194,29 @@ def auto_correlations(df, cols):
 			df_coded[col_name] = col
 		elif col.dtype in ('bool', 'boolean', 'category', 'object'):
 			# Categorical variable
+			# Factorise
 			cat_values = col.dropna().unique()
+			cat_values = sorted(cat_values)
+			col = col.replace({v: i for i, v in enumerate(cat_values)})
+			df_coded[col_name] = col
 			
-			if len(cat_values) == 2:
+			categorical_columns.append(col_name)
+			
+			#if len(cat_values) == 2:
 				# Categorical variable with 2 categories
 				# Code as 0/1/NA
-				cat_values = sorted(cat_values)
-				col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
-				df_coded[col_name] = col
-			else:
+				#cat_values = sorted(cat_values)
+				#col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
+				#df_coded[col_name] = col
+			#else:
 				# Categorical variable with >2 categories
 				# Factorise
 				
 				# Create dummy variables
-				dummies = pd.get_dummies(col, prefix=col_name)
-				df_coded = df_coded.join(dummies)
+				#dummies = pd.get_dummies(col, prefix=col_name)
+				#df_coded = df_coded.join(dummies)
 		else:
 			# Numeric variable, etc.
 			df_coded[col_name] = col
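The hunk above swaps the earlier dummy-variable coding (now commented out) for a single integer-coded column per categorical variable. A side-by-side sketch of the two codings, with invented data; this is not the library's code:

```python
import pandas as pd

col = pd.Series(['B', 'A', 'C', 'A'], name='Group')

# New approach: one column, categories coded in natural sort order
cat_values = sorted(col.dropna().unique())
coded = col.replace({v: i for i, v in enumerate(cat_values)})
print(coded.tolist())  # [1, 0, 2, 0]

# Old approach (now commented out): one 0/1 dummy column per category
dummies = pd.get_dummies(col, prefix='Group')
print(list(dummies.columns))  # ['Group_A', 'Group_B', 'Group_C']
```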
@@ -230,9 +227,47 @@ def auto_correlations(df, cols):
 	for i, col1 in enumerate(df_coded.columns):
 		for col2 in df_coded.columns[:i]:
 			df_2cols = df_coded[[col1, col2]].dropna()
-			statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
-			df_corr.loc[col1, col2] = statistic
-			df_corr.loc[col2, col1] = statistic
+			
+			if col1 in categorical_columns and len(df_coded[col1].unique()) > 2:
+				if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
+					# Categorical-categorical
+					# Compute Cramer's V
+					statistic = stats.contingency.association(df_2cols, method='cramer')
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+				else:
+					# Categorical-nominal, etc.
+					# Compute eta
+					ssw = 0
+					ssb = 0
+					values_mean = df_2cols[col2].astype('float64').mean()
+					for category in df_2cols[col1].unique():
+						subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
+						ssw += ((subgroup - subgroup.mean())**2).sum()
+						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
+					statistic = (ssb / (ssb + ssw))**0.5
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+			else:
+				if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
+					# Categorical-nominal, etc.
+					# Compute eta
+					ssw = 0
+					ssb = 0
+					values_mean = df_2cols[col1].astype('float64').mean()
+					for category in df_2cols[col2].unique():
+						subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
+						ssw += ((subgroup - subgroup.mean())**2).sum()
+						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
+					statistic = (ssb / (ssb + ssw))**0.5
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+				else:
+					# Nominal-nominal, etc.
+					# Compute Pearson r (or Spearman rho, point-biserial, etc.)
+					statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
 		
 		# Correlation with itself is always 1
 		df_corr.loc[col1, col1] = 1
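The η branch in the hunk above accumulates within-group (`ssw`) and between-group (`ssb`) sums of squares and returns √(SSB / (SSB + SSW)). A self-contained sketch of the same quantity as a standalone function (illustrative only; the function name and data are invented, not part of the library):

```python
import pandas as pd

def correlation_ratio(categories, values):
	"""Correlation ratio: eta = sqrt(SS_between / (SS_between + SS_within))."""
	categories = pd.Series(categories)
	values = pd.Series(values).astype('float64')
	grand_mean = values.mean()
	ssw = 0.0  # within-group sum of squares
	ssb = 0.0  # between-group sum of squares
	for category in categories.unique():
		subgroup = values[categories == category]
		ssw += ((subgroup - subgroup.mean())**2).sum()
		ssb += len(subgroup) * (subgroup.mean() - grand_mean)**2
	return (ssb / (ssb + ssw))**0.5

# Eta is 1 when the grouping explains all variation, 0 when it explains none
print(correlation_ratio(['A', 'A', 'B', 'B'], [1, 1, 5, 5]))  # 1.0
print(correlation_ratio(['A', 'B', 'A', 'B'], [1, 2, 2, 1]))  # 0.0
```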