From 90cc7800866db0bdc25bf07f14eaf6a445bc7101 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Thu, 16 May 2024 22:36:09 +1000 Subject: [PATCH] Use eta and Cramer's V in yli.auto_correlations for categorical variables --- yli/descriptives.py | 99 ++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/yli/descriptives.py b/yli/descriptives.py index 95eb029..1ce4c9e 100644 --- a/yli/descriptives.py +++ b/yli/descriptives.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2024 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -146,8 +146,7 @@ def auto_correlations(df, cols): """ Automatically compute pairwise correlation coefficients - Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order. - Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories. + Categorical variables are factorised and coded according to the natural sort order of the categories. Ordinal variables are factorised and coded as ranks. Pairwise Pearson correlation coefficients are then calculated on the coded data. @@ -159,6 +158,10 @@ def auto_correlations(df, cols): * A continuous variable and dichotomous variable are compared using point-biserial correlation * An ordinal variable and dichotomous variable are compared using rank-biserial correlation + The exception is involving a categorical variable with more than 2 categories. + Correlation of 2 such variables is by Cramér's *V*. + Correlation of 1 such variable with another type of variable is by the correlation ratio *η* (on ranks, if applicable). + There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported. :param df: Data to compute correlations for @@ -169,27 +172,11 @@ def auto_correlations(df, cols): :rtype: :class:`yli.descriptives.AutoCorrelationsResult` """ - def _col_to_numeric(col): - if col.dtype == 'category' and col.cat.ordered: - # Ordinal variable - # Factorise if required - col, _ = as_numeric(col) - - # Code as ranks - col[col >= 0] = stats.rankdata(col[col >= 0]) - - # Put NaNs back - col = col.astype('float64') - col[col < 0] = pd.NA - - return col - else: - # FIXME: Bools, binary, etc. - return col - # Code columns as numeric/ranks/etc. as appropriate df_coded = pd.DataFrame(index=df.index) - + + categorical_columns = [] + for col_name in cols: col = df[col_name] @@ -207,19 +194,29 @@ def auto_correlations(df, cols): df_coded[col_name] = col elif col.dtype in ('bool', 'boolean', 'category', 'object'): + # Categorical variable + # Factorise cat_values = col.dropna().unique() + cat_values = sorted(cat_values) + col = col.replace({v: i for i, v in enumerate(cat_values)}) + df_coded[col_name] = col - if len(cat_values) == 2: + categorical_columns.append(col_name) + + #if len(cat_values) == 2: # Categorical variable with 2 categories # Code as 0/1/NA - cat_values = sorted(cat_values) - col = col.replace({cat_values[0]: 0, cat_values[1]: 1}) - df_coded[col_name] = col - else: + #cat_values = sorted(cat_values) + #col = col.replace({cat_values[0]: 0, cat_values[1]: 1}) + #df_coded[col_name] = col + #else: # Categorical variable with >2 categories + # Factorise + + # Create dummy variables - dummies = pd.get_dummies(col, prefix=col_name) - df_coded = df_coded.join(dummies) + #dummies = pd.get_dummies(col, prefix=col_name) + #df_coded = df_coded.join(dummies) else: # Numeric variable, etc. df_coded[col_name] = col @@ -230,9 +227,47 @@ def auto_correlations(df, cols): for i, col1 in enumerate(df_coded.columns): for col2 in df_coded.columns[:i]: df_2cols = df_coded[[col1, col2]].dropna() - statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic - df_corr.loc[col1, col2] = statistic - df_corr.loc[col2, col1] = statistic + + if col1 in categorical_columns and len(df_coded[col1].unique()) > 2: + if col2 in categorical_columns and len(df_coded[col2].unique()) > 2: + # Categorical-categorical + # Compute Cramer's V + statistic = stats.contingency.association(df_2cols, method='cramer') + df_corr.loc[col1, col2] = statistic + df_corr.loc[col2, col1] = statistic + else: + # Categorical-nominal, etc. + # Compute eta + ssw = 0 + ssb = 0 + values_mean = df_2cols[col2].astype('float64').mean() + for category in df_2cols[col1].unique(): + subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64') + ssw += ((subgroup - subgroup.mean())**2).sum() + ssb += len(subgroup) * (subgroup.mean() - values_mean)**2 + statistic = (ssb / (ssb + ssw))**0.5 + df_corr.loc[col1, col2] = statistic + df_corr.loc[col2, col1] = statistic + else: + if col2 in categorical_columns and len(df_coded[col2].unique()) > 2: + # Categorical-nominal, etc. + # Compute eta + ssw = 0 + ssb = 0 + values_mean = df_2cols[col1].astype('float64').mean() + for category in df_2cols[col2].unique(): + subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64') + ssw += ((subgroup - subgroup.mean())**2).sum() + ssb += len(subgroup) * (subgroup.mean() - values_mean)**2 + statistic = (ssb / (ssb + ssw))**0.5 + df_corr.loc[col1, col2] = statistic + df_corr.loc[col2, col1] = statistic + else: + # Nominal-nominal, etc. + # Compute Pearson r (or Spearman rho, point-biserial, etc.) + statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic + df_corr.loc[col1, col2] = statistic + df_corr.loc[col2, col1] = statistic # Correlation with itself is always 1 df_corr.loc[col1, col1] = 1