Use eta and Cramer's V in yli.auto_correlations for categorical variables

2024-05-16 22:36:09 +10:00 · 2024-05-16 22:36:09 +10:00 · 90cc780086
commit 90cc780086
parent 553448e59a
1 changed files with 67 additions and 32 deletions
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2024  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@ -146,8 +146,7 @@ def auto_correlations(df, cols):
 	"""
 	Automatically compute pairwise correlation coefficients
 	
-	Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
-	Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
+	Categorical variables are factorised and coded according to the natural sort order of the categories.
 	Ordinal variables are factorised and coded as ranks.
 	Pairwise Pearson correlation coefficients are then calculated on the coded data.
 	
@ -159,6 +158,10 @@ def auto_correlations(df, cols):
 	* A continuous variable and dichotomous variable are compared using point-biserial correlation
 	* An ordinal variable and dichotomous variable are compared using rank-biserial correlation
 	
+	The exception is involving a categorical variable with more than 2 categories.
+	Correlation of 2 such variables is by Cramér's *V*.
+	Correlation of 1 such variable with another type of variable is by the correlation ratio *η* (on ranks, if applicable).
+	
 	There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
 	
 	:param df: Data to compute correlations for
@ -169,27 +172,11 @@ def auto_correlations(df, cols):
 	:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
 	"""
 	
-	def _col_to_numeric(col):
-		if col.dtype == 'category' and col.cat.ordered:
-			# Ordinal variable
-			# Factorise if required
-			col, _ = as_numeric(col)
-			
-			# Code as ranks
-			col[col >= 0] = stats.rankdata(col[col >= 0])
-			
-			# Put NaNs back
-			col = col.astype('float64')
-			col[col < 0] = pd.NA
-			
-			return col
-		else:
-			# FIXME: Bools, binary, etc.
-			return col
-	
 	# Code columns as numeric/ranks/etc. as appropriate
 	df_coded = pd.DataFrame(index=df.index)
-
+	
+	categorical_columns = []
+	
 	for col_name in cols:
 		col = df[col_name]
 		
@ -207,19 +194,29 @@ def auto_correlations(df, cols):
 			
 			df_coded[col_name] = col
 		elif col.dtype in ('bool', 'boolean', 'category', 'object'):
+			# Categorical variable
+			# Factorise
 			cat_values = col.dropna().unique()
+			cat_values = sorted(cat_values)
+			col = col.replace({v: i for i, v in enumerate(cat_values)})
+			df_coded[col_name] = col
 			
-			if len(cat_values) == 2:
+			categorical_columns.append(col_name)
+			
+			#if len(cat_values) == 2:
 				# Categorical variable with 2 categories
 				# Code as 0/1/NA
-				cat_values = sorted(cat_values)
-				col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
-				df_coded[col_name] = col
-			else:
+				#cat_values = sorted(cat_values)
+				#col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
+				#df_coded[col_name] = col
+			#else:
 				# Categorical variable with >2 categories
+				# Factorise
+				
+				
 				# Create dummy variables
-				dummies = pd.get_dummies(col, prefix=col_name)
-				df_coded = df_coded.join(dummies)
+				#dummies = pd.get_dummies(col, prefix=col_name)
+				#df_coded = df_coded.join(dummies)
 		else:
 			# Numeric variable, etc.
 			df_coded[col_name] = col
@ -230,9 +227,47 @@ def auto_correlations(df, cols):
 	for i, col1 in enumerate(df_coded.columns):
 		for col2 in df_coded.columns[:i]:
 			df_2cols = df_coded[[col1, col2]].dropna()
-			statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
-			df_corr.loc[col1, col2] = statistic
-			df_corr.loc[col2, col1] = statistic
+			
+			if col1 in categorical_columns and len(df_coded[col1].unique()) > 2:
+				if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
+					# Categorical-categorical
+					# Compute Cramer's V
+					statistic = stats.contingency.association(df_2cols, method='cramer')
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+				else:
+					# Categorical-nominal, etc.
+					# Compute eta
+					ssw = 0
+					ssb = 0
+					values_mean = df_2cols[col2].astype('float64').mean()
+					for category in df_2cols[col1].unique():
+						subgroup = df_2cols[df_2cols[col1] == category][col2].astype('float64')
+						ssw += ((subgroup - subgroup.mean())**2).sum()
+						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
+					statistic = (ssb / (ssb + ssw))**0.5
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+			else:
+				if col2 in categorical_columns and len(df_coded[col2].unique()) > 2:
+					# Categorical-nominal, etc.
+					# Compute eta
+					ssw = 0
+					ssb = 0
+					values_mean = df_2cols[col1].astype('float64').mean()
+					for category in df_2cols[col2].unique():
+						subgroup = df_2cols[df_2cols[col2] == category][col1].astype('float64')
+						ssw += ((subgroup - subgroup.mean())**2).sum()
+						ssb += len(subgroup) * (subgroup.mean() - values_mean)**2
+					statistic = (ssb / (ssb + ssw))**0.5
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
+				else:
+					# Nominal-nominal, etc.
+					# Compute Pearson r (or Spearman rho, point-biserial, etc.)
+					statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
+					df_corr.loc[col1, col2] = statistic
+					df_corr.loc[col2, col1] = statistic
 		
 		# Correlation with itself is always 1
 		df_corr.loc[col1, col1] = 1