Add documentation for auto_correlations

2023-02-07 18:49:00 +11:00 · 2023-02-07 18:49:00 +11:00 · 68d7a31b53
commit 68d7a31b53
parent c2d4aaf8be
3 changed files with 44 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -50,6 +50,7 @@ Optional dependencies are:
 * [rpy2](https://rpy2.github.io/), with R packages:
 	* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*)
 	* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
 * matplotlib and [seaborn](https://seaborn.pydata.org/), for plotting functions
 ## Functions
--- a/docs/descriptives.rst
+++ b/docs/descriptives.rst
@ -4,10 +4,15 @@ Descriptive statistics
 Functions
 ---------
 .. autofunction:: yli.auto_correlations
 .. autofunction:: yli.auto_descriptives
 Result classes
 --------------
 .. autoclass:: yli.descriptives.AutoCorrelationsResult
 	:members:
 .. autoclass:: yli.descriptives.AutoDescriptivesResult
 	:members:
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@ -16,7 +16,6 @@
 import pandas as pd
 from scipy import stats
 import seaborn as sns
 from .config import config
 from .utils import as_numeric, check_nan
@ -144,7 +143,31 @@ class AutoDescriptivesResult:
 		return str(table)
 def auto_correlations(df, cols):
-	# TODO: Documentation
+	"""
 	Automatically compute pairwise correlation coefficients
 	Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
 	Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
 	Ordinal variables are factorised and coded as ranks.
 	Pairwise Pearson correlation coefficients are then calculated on the coded data.
 	The effect of the coding is that, for example:
 	* 2 continuous variables are compared using Pearson's *r*
 	* 2 ordinal variables are compared using Spearman's *ρ*
 	* 2 dichotomous variables are compared using Yule's *φ*
 	* A continuous variable and a dichotomous variable are compared using point-biserial correlation
 	* An ordinal variable and a dichotomous variable are compared using rank-biserial correlation
 	There is no *nan_policy* argument. For each pair of variables, observations where either variable is *nan* are omitted before the correlation coefficient is calculated.
 	:param df: Data to compute correlations for
 	:type df: DataFrame
 	:param cols: Columns in *df* for the variables to compute correlations for
 	:type cols: List[str]
 	:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
 	"""
 	def _col_to_numeric(col):
 		if col.dtype == 'category' and col.cat.ordered:
@ -206,7 +229,8 @@ def auto_correlations(df, cols):
 	for i, col1 in enumerate(df_coded.columns):
 		for col2 in df_coded.columns[:i]:
-			statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
+			df_2cols = df_coded[[col1, col2]].dropna()
 			statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
 			df_corr.loc[col1, col2] = statistic
 			df_corr.loc[col2, col1] = statistic
@ -216,9 +240,14 @@ def auto_correlations(df, cols):
 	return AutoCorrelationsResult(df_corr)
 class AutoCorrelationsResult:
-	# TODO: Documentation
+	"""
 	Result of automatically computed pairwise correlation coefficients
 	See :func:`yli.auto_correlations`.
 	"""
 	def __init__(self, correlations):
 		#: Pairwise correlation coefficients (*DataFrame*)
 		self.correlations = correlations
 	def __repr__(self):
@ -245,4 +274,9 @@ class AutoCorrelationsResult:
 		return 'Correlation Matrix\n\n' + str(self.correlations)
 	def plot(self):
 		"""
 		Plot a heatmap of the pairwise correlation coefficients
 		"""
 		import seaborn as sns
 		sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')