scipy-yli/yli/descriptives.py
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
from scipy import stats
import seaborn as sns
from .config import config
from .utils import as_numeric, check_nan
def auto_descriptives(df, cols, *, ordinal_range=[]):
"""
Automatically compute descriptive summary statistics
The statistics computed are:

* For a categorical variable – Counts of values
* For a continuous variable – Mean and standard deviation
* For an ordinal variable – Median and IQR (default) or range

There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
:param df: Data to summarise
:type df: DataFrame
:param cols: Columns in *df* for the variables to summarise
:type cols: List[str]
:param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR)
:type ordinal_range: List[str]
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
"""
result_data = []
result_labels = []
for col in cols:
data_cleaned = df[col].dropna()
if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
data_cleaned = data_cleaned.astype('float64')
if col in ordinal_range:
# Report range
result_labels.append((
'{}, median (range)'.format(col),
'{}, median (range)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
len(df) - len(data_cleaned)
))
else:
# Report IQR
result_labels.append((
'{}, median (IQR)'.format(col),
'{}, median (IQR)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique())
# Value counts
result_labels.append((
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
'{}, {}'.format(col, ':'.join(str(v) for v in values)),
))
result_data.append((
':'.join(str((data_cleaned == v).sum()) for v in values),
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Continuous data
result_labels.append((
'{}, μ (SD)'.format(col),
'{}, <i>μ</i> (SD)'.format(col),
))
result_data.append((
'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
len(df) - len(data_cleaned)
))
else:
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)
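
# A minimal usage sketch of auto_descriptives (not part of the library API).
# The DataFrame and column names below are hypothetical, chosen only to exercise
# the continuous, categorical and ordinal-numeric branches above.
def _example_auto_descriptives():
	df = pd.DataFrame({
		'Age': [32.0, 45.0, None, 61.0, 58.0],                      # continuous (1 missing)
		'Sex': ['F', 'M', 'F', 'M', None],                          # categorical (1 missing)
		'Severity': pd.Categorical([1, 2, 2, 3, 1], ordered=True),  # ordinal numeric
	})
	# Report median and range (rather than IQR) for 'Severity'
	return auto_descriptives(df, ['Age', 'Sex', 'Severity'], ordinal_range=['Severity'])
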
class AutoDescriptivesResult:
"""
Result of automatically computed descriptive summary statistics
See :func:`yli.auto_descriptives`.
Results data stored within instances of this class is not intended to be directly accessed.
"""
def __init__(self, *, result_data, result_labels):
# List of tuples (variable summary, missing count)
self._result_data = result_data
# List of tuples (plaintext label, HTML label)
self._result_labels = result_labels
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
result = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
for data, label in zip(self._result_data, self._result_labels):
result += '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
result += '</tbody></table>'
return result
def summary(self):
"""
Return a stringified summary of the descriptive statistics
:rtype: str
"""
# Format data for output
result_labels_fmt = [r[0] for r in self._result_labels]
table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])
return str(table)
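
# Display sketch (uses the hypothetical helper defined above): summary() returns
# the plain-text table, while evaluating the result object in Jupyter renders the
# HTML version via _repr_html_.
def _example_print_descriptives():
	result = _example_auto_descriptives()
	print(result.summary())  # one row per variable, plus a 'Missing' count column
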
def auto_correlations(df, cols):
"""
Automatically compute pairwise correlations between the given variables

Ordinal variables are coded as ranks, categorical variables with 2 categories are coded as 0/1, and categorical variables with more than 2 categories are coded as dummy variables. Pearson correlation coefficients are then computed pairwise on the coded data.

:param df: Data to compute correlations for
:type df: DataFrame
:param cols: Columns in *df* for the variables to compute correlations for
:type cols: List[str]
:rtype: :class:`yli.descriptives.AutoCorrelationsResult`
"""
def _col_to_numeric(col):
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
return col
else:
# FIXME: Bools, binary, etc.
return col
# Code columns as numeric/ranks/etc. as appropriate
df_coded = pd.DataFrame(index=df.index)
for col_name in cols:
col = df[col_name]
if col.dtype == 'category' and col.cat.ordered:
# Ordinal variable
# Factorise if required
col, _ = as_numeric(col)
# Code as ranks
col[col >= 0] = stats.rankdata(col[col >= 0])
# Put NaNs back
col = col.astype('float64')
col[col < 0] = pd.NA
df_coded[col_name] = col
elif col.dtype in ('bool', 'boolean', 'category', 'object'):
cat_values = col.dropna().unique()
if len(cat_values) == 2:
# Categorical variable with 2 categories
# Code as 0/1/NA
cat_values = sorted(cat_values)
col = col.replace({cat_values[0]: 0, cat_values[1]: 1})
df_coded[col_name] = col
else:
# Categorical variable with >2 categories
# Create dummy variables
dummies = pd.get_dummies(col, prefix=col_name)
df_coded = df_coded.join(dummies)
else:
# Numeric variable, etc.
df_coded[col_name] = col
# Compute pairwise correlation
df_corr = pd.DataFrame(index=df_coded.columns, columns=df_coded.columns, dtype='float64')
for i, col1 in enumerate(df_coded.columns):
for col2 in df_coded.columns[:i]:
statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic
df_corr.loc[col1, col2] = statistic
df_corr.loc[col2, col1] = statistic
# Correlation with itself is always 1
df_corr.loc[col1, col1] = 1
return AutoCorrelationsResult(df_corr)
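
# A minimal sketch of auto_correlations on hypothetical data: 'Severity'
# (ordered categorical) is coded as ranks, 'Sex' (2 categories) as 0/1, and
# 'Site' (3 categories) as dummy variables, so the correlation matrix has one
# row/column per coded variable.
def _example_auto_correlations():
	df = pd.DataFrame({
		'Age': [32.0, 45.0, 50.0, 61.0, 58.0],
		'Sex': ['F', 'M', 'F', 'M', 'F'],
		'Severity': pd.Categorical([1, 2, 2, 3, 1], ordered=True),
		'Site': ['A', 'B', 'C', 'A', 'B'],
	})
	return auto_correlations(df, ['Age', 'Sex', 'Severity', 'Site'])
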
class AutoCorrelationsResult:
"""
Result of automatically computed pairwise correlations

See :func:`yli.auto_correlations`.

The correlation matrix is stored as a DataFrame in the *correlations* attribute.
"""
def __init__(self, correlations):
self.correlations = correlations
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
df_repr = self.correlations._repr_html_()
# Insert caption
idx_endopen = df_repr.index('>', df_repr.index('<table'))
df_repr = df_repr[:idx_endopen+1] + '<caption>Correlation Matrix</caption>' + df_repr[idx_endopen+1:]
return df_repr
def summary(self):
"""
Return a stringified summary of the correlation matrix
:rtype: str
"""
return 'Correlation Matrix\n\n' + str(self.correlations)
def plot(self):
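"""
Plot the correlation matrix as a heatmap
"""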
sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')
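
# Usage sketch (assumes matplotlib, which seaborn requires, is available):
# print the correlation matrix and draw it as a heatmap, using the hypothetical
# helper defined above.
def _example_plot_correlations():
	import matplotlib.pyplot as plt
	result = _example_auto_correlations()
	print(result.summary())  # plain-text correlation matrix
	result.plot()            # heatmap on the current matplotlib axes
	plt.show()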