Add documentation for auto_correlations
This commit is contained in:
parent
c2d4aaf8be
commit
68d7a31b53
@ -50,6 +50,7 @@ Optional dependencies are:
|
|||||||
* [rpy2](https://rpy2.github.io/), with R packages:
|
* [rpy2](https://rpy2.github.io/), with R packages:
|
||||||
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*)
|
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionResult.bayesfactor_beta_zero*)
|
||||||
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
|
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
|
||||||
|
* matplotlib and [seaborn](https://seaborn.pydata.org/), for plotting functions
|
||||||
|
|
||||||
## Functions
|
## Functions
|
||||||
|
|
||||||
|
@ -4,10 +4,15 @@ Descriptive statistics
|
|||||||
Functions
|
Functions
|
||||||
---------
|
---------
|
||||||
|
|
||||||
|
.. autofunction:: yli.auto_correlations
|
||||||
|
|
||||||
.. autofunction:: yli.auto_descriptives
|
.. autofunction:: yli.auto_descriptives
|
||||||
|
|
||||||
Result classes
|
Result classes
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
|
.. autoclass:: yli.descriptives.AutoCorrelationsResult
|
||||||
|
:members:
|
||||||
|
|
||||||
.. autoclass:: yli.descriptives.AutoDescriptivesResult
|
.. autoclass:: yli.descriptives.AutoDescriptivesResult
|
||||||
:members:
|
:members:
|
||||||
|
@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy import stats
|
from scipy import stats
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
from .config import config
|
from .config import config
|
||||||
from .utils import as_numeric, check_nan
|
from .utils import as_numeric, check_nan
|
||||||
@ -144,7 +143,31 @@ class AutoDescriptivesResult:
|
|||||||
return str(table)
|
return str(table)
|
||||||
|
|
||||||
def auto_correlations(df, cols):
|
def auto_correlations(df, cols):
|
||||||
# TODO: Documentation
|
"""
|
||||||
|
Automatically compute pairwise correlation coefficients
|
||||||
|
|
||||||
|
Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
|
||||||
|
Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
|
||||||
|
Ordinal variables are factorised and coded as ranks.
|
||||||
|
Pairwise Pearson correlation coefficients are then calculated on the coded data.
|
||||||
|
|
||||||
|
The effect of the coding is that, for example:
|
||||||
|
|
||||||
|
* 2 continuous variables are compared using Pearson's *r*
|
||||||
|
* 2 ordinal variables are compared using Spearman's *ρ*
|
||||||
|
* 2 dichotomous variables are compared using Yule's *φ*
|
||||||
|
* A continuous variable and a dichotomous variable are compared using point-biserial correlation
|
||||||
|
* An ordinal variable and a dichotomous variable are compared using rank-biserial correlation
|
||||||
|
|
||||||
|
There is no *nan_policy* argument. For each pair of variables, observations where either variable is *nan* are omitted when computing that pair's correlation coefficient.
|
||||||
|
|
||||||
|
:param df: Data to compute correlations for
|
||||||
|
:type df: DataFrame
|
||||||
|
:param cols: Columns in *df* for the variables to compute correlations for
|
||||||
|
:type cols: List[str]
|
||||||
|
|
||||||
|
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
|
||||||
|
"""
|
||||||
|
|
||||||
def _col_to_numeric(col):
|
def _col_to_numeric(col):
|
||||||
if col.dtype == 'category' and col.cat.ordered:
|
if col.dtype == 'category' and col.cat.ordered:
|
||||||
@ -206,7 +229,8 @@ def auto_correlations(df, cols):
|
|||||||
|
|
||||||
for i, col1 in enumerate(df_coded.columns):
|
for i, col1 in enumerate(df_coded.columns):
|
||||||
for col2 in df_coded.columns[:i]:
|
for col2 in df_coded.columns[:i]:
|
||||||
statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
|
df_2cols = df_coded[[col1, col2]].dropna()
|
||||||
|
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
|
||||||
df_corr.loc[col1, col2] = statistic
|
df_corr.loc[col1, col2] = statistic
|
||||||
df_corr.loc[col2, col1] = statistic
|
df_corr.loc[col2, col1] = statistic
|
||||||
|
|
||||||
@ -216,9 +240,14 @@ def auto_correlations(df, cols):
|
|||||||
return AutoCorrelationsResult(df_corr)
|
return AutoCorrelationsResult(df_corr)
|
||||||
|
|
||||||
class AutoCorrelationsResult:
|
class AutoCorrelationsResult:
|
||||||
# TODO: Documentation
|
"""
|
||||||
|
Result of automatically computed pairwise correlation coefficients
|
||||||
|
|
||||||
|
See :func:`yli.auto_correlations`.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, correlations):
|
def __init__(self, correlations):
|
||||||
|
#: Pairwise correlation coefficients (*DataFrame*)
|
||||||
self.correlations = correlations
|
self.correlations = correlations
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
@ -245,4 +274,9 @@ class AutoCorrelationsResult:
|
|||||||
return 'Correlation Matrix\n\n' + str(self.correlations)
|
return 'Correlation Matrix\n\n' + str(self.correlations)
|
||||||
|
|
||||||
def plot(self):
|
def plot(self):
|
||||||
|
"""
|
||||||
|
Plot a heatmap of the pairwise correlation coefficients
|
||||||
|
"""
|
||||||
|
|
||||||
|
import seaborn as sns
|
||||||
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')
|
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')
|
||||||
|
Loading…
Reference in New Issue
Block a user