#   scipy-yli: Helpful SciPy utilities and recipes
#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import pandas as pd

from .config import config
from .utils import check_nan


def auto_descriptives(df, cols, *, ordinal_range=None, ordinal_iqr=None):
    """
    Automatically compute descriptive summary statistics

    The statistics computed are:

    * For a categorical variable – Counts of values
    * For a continuous variable – Mean and standard deviation
    * For an ordinal variable – Median and range or IQR

    There is no *nan_policy* argument. *nan* values are omitted from summary
    statistics for each variable, and the count of *nan* values is reported.

    Columns with dtype *bool*, *boolean*, *category* or *object* are treated as
    categorical. Columns with dtype *float64*, *int64*, *Float64* or *Int64* are
    treated as continuous, unless listed in *ordinal_range* or *ordinal_iqr*.
    If a column is listed in both, *ordinal_range* takes precedence.

    :param df: Data to summarise
    :type df: DataFrame
    :param cols: Columns in *df* for the variables to summarise
    :type cols: List[str]
    :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range (default: none)
    :type ordinal_range: List[str]
    :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR (default: none)
    :type ordinal_iqr: List[str]

    :raises Exception: If a column has a dtype other than those listed above

    :rtype: :class:`yli.descriptives.AutoDescriptivesResult`
    """

    # Use None rather than a mutable list as the default, and normalise here
    if ordinal_range is None:
        ordinal_range = ()
    if ordinal_iqr is None:
        ordinal_iqr = ()

    result_data = []
    result_labels = []

    for col in cols:
        data_cleaned = df[col].dropna()

        # The missing count is computed the same way for every variable type
        n_missing = len(df) - len(data_cleaned)

        if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
            # Categorical data
            values = sorted(data_cleaned.unique())

            # Value counts, colon-separated in the same order as the values in the label
            label = '{}, {}'.format(col, ':'.join(str(v) for v in values))
            summary = ':'.join(str((data_cleaned == v).sum()) for v in values)
        elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
            if col in ordinal_range:
                # Ordinal data (report range)
                label = '{}, median (range)'.format(col)
                summary = '{:.2f} ({:.2f}–{:.2f})'.format(
                    data_cleaned.median(), data_cleaned.min(), data_cleaned.max()
                )
            elif col in ordinal_iqr:
                # Ordinal data (report IQR)
                label = '{}, median (IQR)'.format(col)
                summary = '{:.2f} ({:.2f}–{:.2f})'.format(
                    data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)
                )
            else:
                # Continuous data
                label = '{}, μ (SD)'.format(col)
                summary = '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std())
        else:
            raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))

        # The plaintext and HTML labels are currently identical
        result_labels.append((label, label))
        result_data.append((summary, n_missing))

    return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels)


class AutoDescriptivesResult:
    """
    Result of automatically computed descriptive summary statistics

    See :func:`yli.auto_descriptives`.

    Results data stored within instances of this class is not intended to be directly accessed.
    """

    def __init__(self, *, result_data, result_labels):
        # List of tuples (variable summary, missing count)
        self._result_data = result_data
        # List of tuples (plaintext label, HTML label)
        self._result_labels = result_labels

    def __repr__(self):
        # When configured, show the summary table instead of the default object repr
        if config.repr_is_summary:
            return self.summary()
        return super().__repr__()

    def _repr_html_(self):
        """
        Return the results as an HTML table, one row per variable

        Each row contains the HTML label, the variable summary and the missing count.

        :rtype: str
        """

        # NOTE(review): the table markup was reconstructed after the tags were
        # stripped from the string literals – confirm against the upstream file
        rows = ''.join(
            '<tr><th>{}</th><td>{}</td><td>{}</td></tr>'.format(label[1], data[0], data[1])
            for data, label in zip(self._result_data, self._result_labels)
        )

        return (
            '<table><thead><tr><th></th><th></th><th>Missing</th></tr></thead><tbody>'
            + rows
            + '</tbody></table>'
        )

    def summary(self):
        """
        Return a stringified summary of the descriptive statistics

        :rtype: str
        """

        # Format data for output: plaintext labels become the table index
        result_labels_fmt = [r[0] for r in self._result_labels]
        table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing'])

        return str(table)