In auto_descriptives, autodetect ordinal variables based on category dtype
This commit is contained in:
parent
5633a191f1
commit
0fa261498a
@ -19,7 +19,7 @@ import pandas as pd
|
|||||||
from .config import config
|
from .config import config
|
||||||
from .utils import check_nan
|
from .utils import check_nan
|
||||||
|
|
||||||
def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
def auto_descriptives(df, cols, *, ordinal_range=[]):
|
||||||
"""
|
"""
|
||||||
Automatically compute descriptive summary statistics
|
Automatically compute descriptive summary statistics
|
||||||
|
|
||||||
@ -27,7 +27,7 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
|||||||
|
|
||||||
* For a categorical variable – Counts of values
|
* For a categorical variable – Counts of values
|
||||||
* For a continuous variable – Mean and standard deviation
|
* For a continuous variable – Mean and standard deviation
|
||||||
* For an ordinal variable – Median and range or IQR
|
* For an ordinal variable – Median and IQR (default) or range
|
||||||
|
|
||||||
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
|
There is no *nan_policy* argument. *nan* values are omitted from summary statistics for each variable, and the count of *nan* values is reported.
|
||||||
|
|
||||||
@ -35,10 +35,8 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
|||||||
:type df: DataFrame
|
:type df: DataFrame
|
||||||
:param cols: Columns in *df* for the variables to summarise
|
:param cols: Columns in *df* for the variables to summarise
|
||||||
:type cols: List[str]
|
:type cols: List[str]
|
||||||
:param ordinal_range: Columns in *df* to treat as ordinal, and report median and range
|
:param ordinal_range: Columns of ordinal variables in *df* to report median and range (rather than IQR)
|
||||||
:type ordinal_range: List[str]
|
:type ordinal_range: List[str]
|
||||||
:param ordinal_iqr: Columns in *df* to treat as ordinal, and report median and IQR
|
|
||||||
:type ordinal_iqr: List[str]
|
|
||||||
|
|
||||||
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
|
:rtype: :class:`yli.descriptives.AutoDescriptivesResult`
|
||||||
"""
|
"""
|
||||||
@ -49,7 +47,31 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
|||||||
for col in cols:
|
for col in cols:
|
||||||
data_cleaned = df[col].dropna()
|
data_cleaned = df[col].dropna()
|
||||||
|
|
||||||
if data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
|
if data_cleaned.dtype == 'category' and data_cleaned.cat.ordered and data_cleaned.cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||||
|
# Ordinal numeric data
|
||||||
|
data_cleaned = data_cleaned.astype('float64')
|
||||||
|
|
||||||
|
if col in ordinal_range:
|
||||||
|
# Report range
|
||||||
|
result_labels.append((
|
||||||
|
'{}, median (range)'.format(col),
|
||||||
|
'{}, median (range)'.format(col),
|
||||||
|
))
|
||||||
|
result_data.append((
|
||||||
|
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
|
||||||
|
len(df) - len(data_cleaned)
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
# Report IQR
|
||||||
|
result_labels.append((
|
||||||
|
'{}, median (IQR)'.format(col),
|
||||||
|
'{}, median (IQR)'.format(col),
|
||||||
|
))
|
||||||
|
result_data.append((
|
||||||
|
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
|
||||||
|
len(df) - len(data_cleaned)
|
||||||
|
))
|
||||||
|
elif data_cleaned.dtype in ('bool', 'boolean', 'category', 'object'):
|
||||||
# Categorical data
|
# Categorical data
|
||||||
# FIXME: Sort order
|
# FIXME: Sort order
|
||||||
values = sorted(data_cleaned.unique())
|
values = sorted(data_cleaned.unique())
|
||||||
@ -64,36 +86,15 @@ def auto_descriptives(df, cols, *, ordinal_range=[], ordinal_iqr=[]):
|
|||||||
len(df) - len(data_cleaned)
|
len(df) - len(data_cleaned)
|
||||||
))
|
))
|
||||||
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
elif data_cleaned.dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||||
if col in ordinal_range:
|
# Continuous data
|
||||||
# Ordinal data (report range)
|
result_labels.append((
|
||||||
result_labels.append((
|
'{}, μ (SD)'.format(col),
|
||||||
'{}, median (range)'.format(col),
|
'{}, <i>μ</i> (SD)'.format(col),
|
||||||
'{}, median (range)'.format(col),
|
))
|
||||||
))
|
result_data.append((
|
||||||
result_data.append((
|
'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
|
||||||
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.min(), data_cleaned.max()),
|
len(df) - len(data_cleaned)
|
||||||
len(df) - len(data_cleaned)
|
))
|
||||||
))
|
|
||||||
elif col in ordinal_iqr:
|
|
||||||
# Ordinal data (report IQR)
|
|
||||||
result_labels.append((
|
|
||||||
'{}, median (IQR)'.format(col),
|
|
||||||
'{}, median (IQR)'.format(col),
|
|
||||||
))
|
|
||||||
result_data.append((
|
|
||||||
'{:.2f} ({:.2f}–{:.2f})'.format(data_cleaned.median(), data_cleaned.quantile(0.25), data_cleaned.quantile(0.75)),
|
|
||||||
len(df) - len(data_cleaned)
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
# Continuous data
|
|
||||||
result_labels.append((
|
|
||||||
'{}, μ (SD)'.format(col),
|
|
||||||
'{}, <i>μ</i> (SD)'.format(col),
|
|
||||||
))
|
|
||||||
result_data.append((
|
|
||||||
'{:.2f} ({:.2f})'.format(data_cleaned.mean(), data_cleaned.std()),
|
|
||||||
len(df) - len(data_cleaned)
|
|
||||||
))
|
|
||||||
else:
|
else:
|
||||||
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
|
raise Exception('Unsupported dtype for auto_descriptives, {}'.format(df[col].dtype))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user