From 68d7a31b5375a5995af6b47af1e9837b5594e210 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Tue, 7 Feb 2023 18:49:00 +1100 Subject: [PATCH] Add documentation for auto_correlations --- README.md | 1 + docs/descriptives.rst | 5 +++++ yli/descriptives.py | 42 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7264288..462acdd 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Optional dependencies are: * [rpy2](https://rpy2.github.io/), with R packages: * [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*) * [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit* +* matplotlib and [seaborn](https://seaborn.pydata.org/), for plotting functions ## Functions diff --git a/docs/descriptives.rst b/docs/descriptives.rst index d6aaf0f..fea8c01 100644 --- a/docs/descriptives.rst +++ b/docs/descriptives.rst @@ -4,10 +4,15 @@ Descriptive statistics Functions --------- +.. autofunction:: yli.auto_correlations + .. autofunction:: yli.auto_descriptives Result classes -------------- +.. autoclass:: yli.descriptives.AutoCorrelationsResult + :members: + .. autoclass:: yli.descriptives.AutoDescriptivesResult :members: diff --git a/yli/descriptives.py b/yli/descriptives.py index dadbe79..95eb029 100644 --- a/yli/descriptives.py +++ b/yli/descriptives.py @@ -16,7 +16,6 @@ import pandas as pd from scipy import stats -import seaborn as sns from .config import config from .utils import as_numeric, check_nan @@ -144,7 +143,31 @@ class AutoDescriptivesResult: return str(table) def auto_correlations(df, cols): - # TODO: Documentation + """ + Automatically compute pairwise correlation coefficients + + Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order. 
+ Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories. + Ordinal variables are factorised and coded as ranks. + Pairwise Pearson correlation coefficients are then calculated on the coded data. + + The effect of the coding is that, for example: + + * 2 continuous variables are compared using Pearson's *r* + * 2 ordinal variables are compared using Spearman's *ρ* + * 2 dichotomous variables are compared using Yule's *φ* + * A continuous variable and dichotomous variable are compared using point-biserial correlation + * An ordinal variable and dichotomous variable are compared using rank-biserial correlation + + There is no *nan_policy* argument. *nan* values are omitted pairwise: each correlation coefficient is calculated using only the observations where both variables are not *nan*. + + :param df: Data to compute correlations for + :type df: DataFrame + :param cols: Columns in *df* for the variables to compute correlations for + :type cols: List[str] + + :rtype: :class:`yli.descriptives.AutoCorrelationsResult` + """ def _col_to_numeric(col): if col.dtype == 'category' and col.cat.ordered: @@ -206,7 +229,8 @@ def auto_correlations(df, cols): for i, col1 in enumerate(df_coded.columns): for col2 in df_coded.columns[:i]: - statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic + df_2cols = df_coded[[col1, col2]].dropna() + statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic df_corr.loc[col1, col2] = statistic df_corr.loc[col2, col1] = statistic @@ -216,9 +240,14 @@ def auto_correlations(df, cols): return AutoCorrelationsResult(df_corr) class AutoCorrelationsResult: - # TODO: Documentation + """ + Result of automatically computed pairwise correlation coefficients + + See :func:`yli.auto_correlations`. 
+ """ def __init__(self, correlations): + #: Pairwise correlation coefficients (*DataFrame*) self.correlations = correlations def __repr__(self): @@ -245,4 +274,9 @@ class AutoCorrelationsResult: return 'Correlation Matrix\n\n' + str(self.correlations) def plot(self): + """ + Plot a heatmap of the pairwise correlation coefficients + """ + + import seaborn as sns sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')