From e268f385be712fb4cbe2d218b8939e83f8f1116c Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Thu, 10 Nov 2022 21:20:06 +1100 Subject: [PATCH] Implement yli.auto_descriptives --- docs/descriptives.rst | 13 ++++ docs/index.rst | 1 + yli/__init__.py | 1 + yli/descriptives.py | 140 ++++++++++++++++++++++++++++++++++++++++++ yli/sig_tests.py | 2 +- 5 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 docs/descriptives.rst create mode 100644 yli/descriptives.py diff --git a/docs/descriptives.rst b/docs/descriptives.rst new file mode 100644 index 0000000..d6aaf0f --- /dev/null +++ b/docs/descriptives.rst @@ -0,0 +1,13 @@ +Descriptive statistics +====================== + +Functions +--------- + +.. autofunction:: yli.auto_descriptives + +Result classes +-------------- + +.. autoclass:: yli.descriptives.AutoDescriptivesResult + :members: diff --git a/docs/index.rst b/docs/index.rst index 0ae28de..9dd97b7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,7 @@ scipy-yli API reference :caption: Contents: general.rst + descriptives.rst sig_tests.rst regress.rst io.rst diff --git a/yli/__init__.py b/yli/__init__.py index 89415d7..476f901 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -16,6 +16,7 @@ from .bayes_factors import bayesfactor_afbf from .config import config +from .descriptives import auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted from .regress import PenalisedLogit, logit_then_regress, regress, vif diff --git a/yli/descriptives.py b/yli/descriptives.py new file mode 100644 index 0000000..eeb9a8e --- /dev/null +++ b/yli/descriptives.py @@ -0,0 +1,140 @@ +# scipy-yli: Helpful SciPy utilities and recipes +# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public 
License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import pandas as pd + +from .config import config +from .utils import check_nan + +def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): + """ + Automatically compute descriptive summary statistics + + The statistics computed are: + + * For a categorical variable – Counts of values + * For a continuous variable – Mean and standard deviation + * For an ordinal variable – Median and range or IQR + + There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported. 
+ + :param df: Data to summarise + :type df: DataFrame + :param cols: Columns in *df* for the variables to summarise + :type cols: List[str] + :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range + :type ordinal_range: List[str] + :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR + :type ordinal_iqr: List[str] + + :rtype: :class:`yli.descriptives.AutoDescriptivesResult` + """ + + result_data = [] + result_labels = [] + + for col in cols: + data_cleaned = df[col].dropna() + + if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): + # Categorical data + values = sorted(data_cleaned.unique()) + + # Value counts + result_labels.append(( + '{}, {}'.format(col, ':'.join(str(v) for v in values)), + '{}, {}'.format(col, ':'.join(str(v) for v in values)), + )) + result_data.append(( + ':'.join(str((data_cleaned == v).sum()) for v in values), + len(df) - len(data_cleaned) + )) + elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'): + if col in ordinal_range: + # Ordinal data (report range) + result_labels.append(( + '{}, median (range)'.format(col), + '{}, median (range)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()), + len(df) - len(data_cleaned) + )) + elif col in ordinal_iqr: + # Ordinal data (report IQR) + result_labels.append(( + '{}, median (IQR)'.format(col), + '{}, median (IQR)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)), + len(df) - len(data_cleaned) + )) + else: + # Continuous data + result_labels.append(( + '{}, μ (SD)'.format(col), + '{}, μ (SD)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()), + len(df) - len(data_cleaned) + )) + else: + raise Exception('Unsupported dtype for auto_descriptives, 
{}'.format(df[col].dtype)) + + return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels) + +class AutoDescriptivesResult: + """ + Result of automatically computed descriptive summary statistics + + See :func:`yli.auto_descriptives`. + + Results data stored within instances of this class is not intended to be directly accessed. + """ + + def __init__(self, *, result_data, result_labels): + # List of tuples (variable summary, missing count) + self._result_data = result_data + # List of tuples (plaintext label, HTML label) + self._result_labels = result_labels + + def __repr__(self): + if config.repr_is_summary: + return self.summary() + return super().__repr__() + + def _repr_html_(self): + result = '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>' + + for data, label in zip(self._result_data, self._result_labels): + result += '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1]) + + result += '</tbody></table>' + return result + + def summary(self): + """ + Return a stringified summary of the descriptive statistics + + :rtype: str + """ + + # Format data for output + result_labels_fmt = [r[0] for r in self._result_labels] + table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing']) + return str(table) diff --git a/yli/sig_tests.py b/yli/sig_tests.py index 528988b..364f63e 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -722,7 +722,7 @@ class AutoBinaryResult: # List of tuples (first group summary, second group summary, test result) self._result_data = result_data - # List of row labels for the independente variables + # List of tuples (plaintext label, HTML label) self._result_labels = result_labels def __repr__(self):