Add documentation for auto_correlations

This commit is contained in:
RunasSudo 2023-02-07 18:49:00 +11:00
parent c2d4aaf8be
commit 68d7a31b53
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 44 additions and 4 deletions

View File

@ -50,6 +50,7 @@ Optional dependencies are:
* [rpy2](https://rpy2.github.io/), with R packages:
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*)
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
* [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), for plotting functions
## Functions

View File

@ -4,10 +4,15 @@ Descriptive statistics
Functions
---------
.. autofunction:: yli.auto_correlations
.. autofunction:: yli.auto_descriptives
Result classes
--------------
.. autoclass:: yli.descriptives.AutoCorrelationsResult
:members:
.. autoclass:: yli.descriptives.AutoDescriptivesResult
:members:

View File

@ -16,7 +16,6 @@
import pandas as pd
from scipy import stats
import seaborn as sns
from .config import config
from .utils import as_numeric, check_nan
@ -144,7 +143,31 @@ class AutoDescriptivesResult:
return str(table)
def auto_correlations(df, cols):
# TODO: Documentation
"""
Automatically compute pairwise correlation coefficients
Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
Ordinal variables are factorised and coded as ranks.
Pairwise Pearson correlation coefficients are then calculated on the coded data.
The effect of the coding is that, for example:
* 2 continuous variables are compared using Pearson's *r*
* 2 ordinal variables are compared using Spearman's *ρ*
* 2 dichotomous variables are compared using Yule's *φ*
* A continuous variable and a dichotomous variable are compared using point-biserial correlation
* An ordinal variable and a dichotomous variable are compared using rank-biserial correlation
There is no *nan_policy* argument. For each pair of variables, observations with a *nan* value in either variable are omitted before the correlation coefficient is calculated (pairwise deletion).
:param df: Data to compute correlations for
:type df: DataFrame
:param cols: Columns in *df* for the variables to compute correlations for
:type cols: List[str]
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
"""
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
@ -206,7 +229,8 @@ def auto_correlations(df, cols):
for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]:
statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
df_2cols = df_coded[[col1, col2]].dropna()
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
@ -216,9 +240,14 @@ def auto_correlations(df, cols):
return AutoCorrelationsResult(df_corr)
class AutoCorrelationsResult:
# TODO: Documentation
"""
Result of automatically computed pairwise correlation coefficients
See :func:`yli.auto_correlations`.
"""
def __init__(self, correlations):
#: Pairwise correlation coefficients (*DataFrame*)
self.correlations = correlations
def __repr__(self):
@ -245,4 +274,9 @@ class AutoCorrelationsResult:
return 'Correlation Matrix\n\n' + str(self.correlations)
def plot(self):
"""
Plot a heatmap of the pairwise correlation coefficients
"""
import seaborn as sns
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')