Add documentation for auto_correlations

2023-02-07 18:49:00 +11:00 · 2023-02-07 18:49:00 +11:00 · 68d7a31b53
commit 68d7a31b53
parent c2d4aaf8be
3 changed files with 44 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -50,6 +50,7 @@ Optional dependencies are:
 * [rpy2](https://rpy2.github.io/), with R packages:
 	* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*)
 	* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
+* [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), for plotting functions

 ## Functions

--- a/docs/descriptives.rst
+++ b/docs/descriptives.rst
@ -4,10 +4,15 @@ Descriptive statistics
 Functions
 ---------

+.. autofunction:: yli.auto_correlations
+
 .. autofunction:: yli.auto_descriptives

 Result classes
 --------------

+.. autoclass:: yli.descriptives.AutoCorrelationsResult
+	:members:
+
 .. autoclass:: yli.descriptives.AutoDescriptivesResult
 	:members:
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -16,7 +16,6 @@

 import pandas as pd
 from scipy import stats
-import seaborn as sns

 from .config import config
 from .utils import as_numeric, check_nan
@ -144,7 +143,31 @@ class AutoDescriptivesResult:
 		return str(table)

 def auto_correlations(df, cols):
-	# TODO: Documentation
+	"""
+	Automatically compute pairwise correlation coefficients
+	
+	Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
+	Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
+	Ordinal variables are factorised and coded as ranks.
+	Pairwise Pearson correlation coefficients are then calculated on the coded data.
+	
+	The effect of the coding is that, for example:
+	
+	* 2 continuous variables are compared using Pearson's *r*
+	* 2 ordinal variables are compared using Spearman's *ρ*
+	* 2 dichotomous variables are compared using Yule's *φ*
+	* A continuous variable and dichotomous variable are compared using point-biserial correlation
+	* An ordinal variable and dichotomous variable are compared using rank-biserial correlation
+	
+	There is no *nan_policy* argument. *nan* values are omitted pairwise: each correlation coefficient is calculated using only the observations where both variables being compared are non-missing.
+	
+	:param df: Data to compute correlations for
+	:type df: DataFrame
+	:param cols: Columns in *df* for the variables to compute correlations for
+	:type cols: List[str]
+	
+	:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
+	"""
 	
 	def _col_to_numeric(col):
 		if col.dtype == 'category' and col.cat.ordered:
@ -206,7 +229,8 @@ def auto_correlations(df, cols):
 	
 	for i, col1 in enumerate(df_coded.columns):
 		for col2 in df_coded.columns[:i]:
-			statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
+			df_2cols = df_coded[[col1, col2]].dropna()
+			statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
 			df_corr.loc[col1, col2] = statistic
 			df_corr.loc[col2, col1] = statistic
 		
@ -216,9 +240,14 @@ def auto_correlations(df, cols):
 	return AutoCorrelationsResult(df_corr)

 class AutoCorrelationsResult:
-	# TODO: Documentation
+	"""
+	Result of automatically computed pairwise correlation coefficients
+	
+	See :func:`yli.auto_correlations`.
+	"""
 	
 	def __init__(self, correlations):
+		#: Pairwise correlation coefficients (*DataFrame*)
 		self.correlations = correlations
 	
 	def __repr__(self):
@ -245,4 +274,9 @@ class AutoCorrelationsResult:
 		return 'Correlation Matrix\n\n' + str(self.correlations)
 	
 	def plot(self):
+		"""
+		Plot a heatmap of the pairwise correlation coefficients
+		"""
+		
+		import seaborn as sns
 		sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')