scipy-yli/yli/descriptives.py

283 lines
8.8 KiB
Python

# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
from scipy import stats
from .config import config
from .utils import as_numeric, check_nan
def auto_descriptives(df, cols, *, ordinal_range=[]):
"""
Automatically compute descriptive summary statistics
The statistics computed are:
* For a categorical variable – Counts of values
* For a continuous variable – Mean and standard deviation
* For an ordinal variable – Median and IQR (default) or range
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to summarise
:type df: DataFrame
:param cols: Columns in *df* for the variables to summarise
:type cols: List[str]
:param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR)
:type ordinal_range: List[str]
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
"""
result_data = []
result_labels = []
for col in cols:
data_cleaned = df[col].dropna()
if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
data_cleaned = data_cleaned.astype('float64')
if col in ordinal_range:
# Report range
result_labels.append((
'{}, median (range)'.format(col),
'{}, median (range)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
len(df) - len(data_cleaned)
))
else:
# Report IQR
result_labels.append((
'{}, median (IQR)'.format(col),
'{}, median (IQR)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique())
# Value counts
result_labels.append((
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
))
result_data.append((
':'.join(str((data_cleaned == v).sum()) for v in values),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Continuous data
result_labels.append((
'{}, μ (SD)'.format(col),
'{}, <i>μ</i> (SD)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
len(df) - len(data_cleaned)
))
else:
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
class AutoDescriptivesResult:
"""
Result of automatically computed descriptive summary statistics
See :func:`yli.auto_descriptives`.
Results data stored within instances of this class is not intended to be directly accessed.
"""
def __init__(self, *, result_data, result_labels):
# List of tuples (variable summary, missing count)
self._result_data = result_data
# List of tuples (plaintext label, HTML label)
self._result_labels = result_labels
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
result = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
for data, label in zip(self._result_data, self._result_labels):
result += '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
result += '</tbody></table>'
return result
def summary(self):
"""
Return a stringified summary of the tests of association
:rtype: str
"""
# Format data for output
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
return str(table)
def auto_correlations(df, cols):
"""
Automatically compute pairwise correlation coefficients
Dichotomous variables are coded as 0/1, according to which value is lower or higher in the natural sort order.
Categorical variables with more than 2 categories are coded with one-hot dummy variables for all categories.
Ordinal variables are factorised and coded as ranks.
Pairwise Pearson correlation coefficients are then calculated on the coded data.
The effect of the coding is that, for example:
* 2 continuous variables are compared using Pearson's *r*
* 2 ordinal variables are compared using Spearman's *ρ*
* 2 dichotomous variables are compared using Yule's *φ*
* A continuous variable and dichotomous variable are compared using point-biserial correlation
* An ordinal variable and dichotomous variable are compared using rank-biserial correlation
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to compute correlations for
:type df: DataFrame
:param cols: Columns in *df* for the variables to compute correlations for
:type cols: List[str]
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
"""
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
return col
else:
# FIXME: Bools, binary, etc.
return col
# Code columns as numeric/ranks/etc. as appropriate
df_coded = pd.DataFrame(index=df.index)
for col_name in cols:
col = df[col_name]
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
df_coded[col_name] = col
elif col.dtype in ('bool', 'boolean', 'category', 'object'):
cat_values = col.dropna().unique()
if len(cat_values) == 2:
# Categorical variable with 2 categories
# Code as 0/1/NA
cat_values = sorted(cat_values)
col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
df_coded[col_name] = col
else:
# Categorical variable with >2 categories
# Create dummy variables
dummies = pd.get_dummies(col, prefix=col_name)
df_coded = df_coded.join(dummies)
else:
# Numeric variable, etc.
df_coded[col_name] = col
# Compute pairwise correlation
df_corr = pd.DataFrame(index=df_coded.columns, columns=df_coded.columns, dtype='float64')
for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]:
df_2cols = df_coded[[col1, col2]].dropna()
statistic = stats.pearsonr(df_2cols[col1], df_2cols[col2]).statistic
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
# Correlation with itself is always 1
df_corr.loc[col1, col1] = 1
return AutoCorrelationsResult(df_corr)
class AutoCorrelationsResult:
"""
Result of automatically computed pairwise correlation coefficients
See :func:`yli.auto_correlations`.
"""
def __init__(self, correlations):
#: Pairwise correlation coefficients (*DataFrame*)
self.correlations = correlations
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
df_repr = self.correlations._repr_html_()
# Insert caption
idx_endopen = df_repr.index('>', df_repr.index('<table'))
df_repr = df_repr[:idx_endopen+1] + '<caption>Correlation Matrix</caption>' + df_repr[idx_endopen+1:]
return df_repr
def summary(self):
"""
Return a stringified summary of the correlation matrix
:rtype: str
"""
return 'Correlation Matrix\n\n' + str(self.correlations)
def plot(self):
"""
Plot a heatmap of the pairwise correlation coefficients
"""
import seaborn as sns
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')