# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import pandas as pd from scipy import stats import seaborn as sns from .config import config from .utils import as_numeric, check_nan def auto_descriptives(df, cols, *, ordinal_range=[]): """ Automatically compute descriptive summary statistics The statistics computed are: * For a categorical variable – Counts of values * For a continuous variable – Mean and standard deviation * For an ordinal variable – Median and IQR (default) or range There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported. :param df: Data to summarise :type df: DataFrame :param cols: Columns in *df* for the variables to summarise :type cols: List[str] :param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR) :type ordinal_range: List[str] :rtype: :class:`yli.descriptives.AutoDescriptivesResult` """ result_data = [] result_labels = [] for col in cols: data_cleaned = df[col].dropna() if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'): # Ordinal numeric data data_cleaned = data_cleaned.astype('float64') if col in ordinal_range: # Report range result_labels.append(( '{}, median (range)'.format(col), '{}, median (range)'.format(col), )) result_data.append(( '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()), len(df) - len(data_cleaned) )) else: # Report IQR result_labels.append(( '{}, median (IQR)'.format(col), '{}, median (IQR)'.format(col), )) result_data.append(( '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)), len(df) - len(data_cleaned) )) elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): # Categorical data # FIXME: Sort order values = sorted(data_cleaned.unique()) # Value counts result_labels.append(( '{}, {}'.format(col, ':'.join(str(v) for v in values)), '{}, {}'.format(col, ':'.join(str(v) for v in values)), )) result_data.append(( ':'.join(str((data_cleaned == v).sum()) for v in values), len(df) - len(data_cleaned) )) elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'): # Continuous data result_labels.append(( '{}, μ (SD)'.format(col), '{}, μ (SD)'.format(col), )) result_data.append(( '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()), len(df) - len(data_cleaned) )) else: raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype)) return AutoDescriptivesResult(result_data=result_data, result_labels=result_labels) class AutoDescriptivesResult: """ Result of automatically computed descriptive summary statistics See :func:`yli.auto_descriptives`. Results data stored within instances of this class is not intended to be directly accessed. """ def __init__(self, *, result_data, result_labels): # List of tuples (variable summary, missing count) self._result_data = result_data # List of tuples (plaintext label, HTML label) self._result_labels = result_labels def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): result = '' for data, label in zip(self._result_data, self._result_labels): result += ''.format(label[1], data[0], data[1]) result += '
Missing
{}{}{}
' return result def summary(self): """ Return a stringified summary of the tests of association :rtype: str """ # Format data for output result_labels_fmt = [r[0] for r in self._result_labels] table = pd.DataFrame(self._result_data, index=result_labels_fmt, columns=['', 'Missing']) return str(table) def auto_correlations(df, cols): # TODO: Documentation def _col_to_numeric(col): if col.dtype == 'category' and col.cat.ordered: # Ordinal variable # Factorise if required col, _ = as_numeric(col) # Code as ranks col[col >= 0] = stats.rankdata(col[col >= 0]) # Put NaNs back col = col.astype('float64') col[col < 0] = pd.NA return col else: # FIXME: Bools, binary, etc. return col # Code columns as numeric/ranks/etc. as appropriate df_coded = pd.DataFrame(index=df.index) for col_name in cols: col = df[col_name] if col.dtype == 'category' and col.cat.ordered: # Ordinal variable # Factorise if required col, _ = as_numeric(col) # Code as ranks col[col >= 0] = stats.rankdata(col[col >= 0]) # Put NaNs back col = col.astype('float64') col[col < 0] = pd.NA df_coded[col_name] = col elif col.dtype in ('bool', 'boolean', 'category', 'object'): cat_values = col.dropna().unique() if len(cat_values) == 2: # Categorical variable with 2 categories # Code as 0/1/NA cat_values = sorted(cat_values) col = col.replace({cat_values[0]: 0, cat_values[1]: 1}) df_coded[col_name] = col else: # Categorical variable with >2 categories # Create dummy variables dummies = pd.get_dummies(col, prefix=col_name) df_coded = df_coded.join(dummies) else: # Numeric variable, etc. df_coded[col_name] = col # Compute pairwise correlation df_corr = pd.DataFrame(index=df_coded.columns, columns=df_coded.columns, dtype='float64') for i, col1 in enumerate(df_coded.columns): for col2 in df_coded.columns[:i]: statistic = stats.pearsonr(df_coded[col1], df_coded[col2]).statistic df_corr.loc[col1, col2] = statistic df_corr.loc[col2, col1] = statistic # Correlation with itself is always 1 df_corr.loc[col1, col1] = 1 return AutoCorrelationsResult(df_corr) class AutoCorrelationsResult: # TODO: Documentation def __init__(self, correlations): self.correlations = correlations def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): df_repr = self.correlations._repr_html_() # Insert caption idx_endopen = df_repr.index('>', df_repr.index('Correlation Matrix' + df_repr[idx_endopen+1:] return df_repr def summary(self): """ Return a stringified summary of the correlation matrix :rtype: str """ return 'Correlation Matrix\n\n' + str(self.correlations) def plot(self): sns.heatmap(self.correlations, vmin=-1, vmax=1, cmap='RdBu')