From 0fa261498aff1cd5ee00af1d62a231b72b069cab Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Sat, 3 Dec 2022 22:19:24 +1100 Subject: [PATCH] In auto_descriptives, autodetect ordinal variables based on category dtype --- yli/descriptives.py | 73 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/yli/descriptives.py b/yli/descriptives.py index f798dbe..a27b7ec 100644 --- a/yli/descriptives.py +++ b/yli/descriptives.py @@ -19,7 +19,7 @@ import pandas as pd from .config import config from .utils import check_nan -def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): +def auto_descriptives(df, cols, *, ordinal_range=[]): """ Automatically compute descriptive summary statistics @@ -27,7 +27,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): * For a categorical variable – Counts of values * For a continuous variable – Mean and standard deviation - * For an ordinal variable – Median and range or IQR + * For an ordinal variable – Median and IQR (default) or range There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported. @@ -35,10 +35,8 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): :type df: DataFrame :param cols: Columns in *df* for the variables to summarise :type cols: List[str] - :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range + :param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR) :type ordinal_range: List[str] - :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR - :type ordinal_iqr: List[str] :rtype: :class:`yli.descriptives.AutoDescriptivesResult` """ @@ -49,7 +47,31 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): for col in cols: data_cleaned = df[col].dropna() - if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): + if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'): + # Ordinal numeric data + data_cleaned = data_cleaned.astype('float64') + + if col in ordinal_range: + # Report range + result_labels.append(( + '{}, median (range)'.format(col), + '{}, median (range)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()), + len(df) - len(data_cleaned) + )) + else: + # Report IQR + result_labels.append(( + '{}, median (IQR)'.format(col), + '{}, median (IQR)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)), + len(df) - len(data_cleaned) + )) + elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'): # Categorical data # FIXME: Sort order values = sorted(data_cleaned.unique()) @@ -64,36 +86,15 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]): len(df) - len(data_cleaned) )) elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'): - if col in ordinal_range: - # Ordinal data (report range) - result_labels.append(( - '{}, median (range)'.format(col), - '{}, median (range)'.format(col), - )) - result_data.append(( - '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()), - len(df) - len(data_cleaned) - )) - elif col in ordinal_iqr: - # Ordinal data (report IQR) - result_labels.append(( - '{}, median (IQR)'.format(col), - '{}, median (IQR)'.format(col), - )) - result_data.append(( - '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)), - len(df) - len(data_cleaned) - )) - else: - # Continuous data - result_labels.append(( - '{}, μ (SD)'.format(col), - '{}, μ (SD)'.format(col), - )) - result_data.append(( - '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()), - len(df) - len(data_cleaned) - )) + # Continuous data + result_labels.append(( + '{}, μ (SD)'.format(col), + '{}, μ (SD)'.format(col), + )) + result_data.append(( + '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()), + len(df) - len(data_cleaned) + )) else: raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))