Autodetect ordinal variables in auto_univariable

This commit is contained in:
RunasSudo 2023-02-07 18:49:57 +11:00
parent 68d7a31b53
commit dbebc3b8e9
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 39 additions and 30 deletions

View File

@@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va
.. autofunction:: yli.utils.check_nan
dtype conventions
-----------------
.. autofunction:: yli.as_ordinal
General result classes
----------------------

View File

@@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Ensure numeric, factorising if required
data1, _ = as_numeric(data1)
data2, _ = as_numeric(data2)
# Do Mann-Whitney test
# Stata does not perform continuity correction
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
@@ -856,7 +860,7 @@ class AutoBinaryResult:
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
return str(table)
def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
def auto_univariable(df, dep, inds, *, nan_policy='warn'):
"""
Automatically compute univariable tests of association for a dichotomous dependent variable
@@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
:type dep: str
:param inds: Columns in *df* for the independent variables
:type inds: List[str]
:param ordinal: Columns in *df* to treat as ordinal rather than continuous
:type ordinal: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
@@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan(df, nan_policy, cols=[ind])
if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
# Mann-Whitney test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, median (IQR)'.format(ind),
'{}, median (IQR)'.format(ind),
))
result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
result
))
elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# Pearson chi-squared test
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
@@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
result
))
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
if ind in ordinal:
# Mann-Whitney test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
# Continuous data
# t test
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, median (IQR)'.format(ind),
'{}, median (IQR)'.format(ind),
))
result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
result
))
else:
# t test
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, μ (SD)'.format(ind),
'{}, <i>μ</i> (SD)'.format(ind),
))
result_data.append((
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result
))
result_labels.append((
'{}, μ (SD)'.format(ind),
'{}, <i>μ</i> (SD)'.format(ind),
))
result_data.append((
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result
))
else:
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))