diff --git a/yli/descriptives.py b/yli/descriptives.py
index f798dbe..a27b7ec 100644
--- a/yli/descriptives.py
+++ b/yli/descriptives.py
@@ -19,7 +19,7 @@ import pandas as pd
from .config import config
from .utils import check_nan
-def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
+def auto_descriptives(df, cols, *, ordinal_range=[]):
"""
Automatically compute descriptive summary statistics
@@ -27,7 +27,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
* For a categorical variable – Counts of values
* For a continuous variable – Mean and standard deviation
- * For an ordinal variable – Median and range or IQR
+ * For an ordinal variable – Median and IQR (default) or range
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
@@ -35,10 +35,8 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
:type df: DataFrame
:param cols: Columns in *df* for the variables to summarise
:type cols: List[str]
- :param ordinal_range: Columns in *df* to treat as ordinal, and report median and range
+ :param ordinal_range: Ordinal columns in *df* for which to report the median and range (rather than the default median and IQR)
:type ordinal_range: List[str]
- :param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR
- :type ordinal_iqr: List[str]
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
"""
@@ -49,7 +47,31 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
for col in cols:
data_cleaned = df[col].dropna()
- if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
+ if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
+ # Ordinal numeric data
+ data_cleaned = data_cleaned.astype('float64')
+
+ if col in ordinal_range:
+ # Report range
+ result_labels.append((
+ '{}, median (range)'.format(col),
+ '{}, median (range)'.format(col),
+ ))
+ result_data.append((
+ '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
+ len(df) - len(data_cleaned)
+ ))
+ else:
+ # Report IQR
+ result_labels.append((
+ '{}, median (IQR)'.format(col),
+ '{}, median (IQR)'.format(col),
+ ))
+ result_data.append((
+ '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
+ len(df) - len(data_cleaned)
+ ))
+ elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# FIXME: Sort order
values = sorted(data_cleaned.unique())
@@ -64,36 +86,15 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
len(df) - len(data_cleaned)
))
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
- if col in ordinal_range:
- # Ordinal data (report range)
- result_labels.append((
- '{}, median (range)'.format(col),
- '{}, median (range)'.format(col),
- ))
- result_data.append((
- '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
- len(df) - len(data_cleaned)
- ))
- elif col in ordinal_iqr:
- # Ordinal data (report IQR)
- result_labels.append((
- '{}, median (IQR)'.format(col),
- '{}, median (IQR)'.format(col),
- ))
- result_data.append((
- '{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
- len(df) - len(data_cleaned)
- ))
- else:
- # Continuous data
- result_labels.append((
- '{}, μ (SD)'.format(col),
- '{}, μ (SD)'.format(col),
- ))
- result_data.append((
- '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
- len(df) - len(data_cleaned)
- ))
+ # Continuous data
+ result_labels.append((
+ '{}, μ (SD)'.format(col),
+ '{}, μ (SD)'.format(col),
+ ))
+ result_data.append((
+ '{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
+ len(df) - len(data_cleaned)
+ ))
else:
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))