Autodetect ordinal variables in auto_univariable
This commit is contained in:
parent 68d7a31b53
commit dbebc3b8e9
@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va
|
||||
|
||||
.. autofunction:: yli.utils.check_nan
|
||||
|
||||
dtype conventions
|
||||
-----------------
|
||||
|
||||
.. autofunction:: yli.as_ordinal
|
||||
|
||||
General result classes
|
||||
----------------------
|
||||
|
||||
|
@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
|
||||
# Ensure 2 groups for ind
|
||||
group1, data1, group2, data2 = as_2groups(df, dep, ind)
|
||||
|
||||
# Ensure numeric, factorising if required
|
||||
data1, _ = as_numeric(data1)
|
||||
data2, _ = as_numeric(data2)
|
||||
|
||||
# Do Mann-Whitney test
|
||||
# Stata does not perform continuity correction
|
||||
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
|
||||
@ -856,7 +860,7 @@ class AutoBinaryResult:
|
||||
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
|
||||
return str(table)
|
||||
|
||||
def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
||||
def auto_univariable(df, dep, inds, *, nan_policy='warn'):
|
||||
"""
|
||||
Automatically compute univariable tests of association for a dichotomous dependent variable
|
||||
|
||||
@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
||||
:type dep: str
|
||||
:param inds: Columns in *df* for the independent variables
|
||||
:type inds: List[str]
|
||||
:param ordinal: Columns in *df* to treat as ordinal rather than continuous
|
||||
:type ordinal: List[str]
|
||||
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
||||
:type nan_policy: str
|
||||
|
||||
@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||
df_cleaned = check_nan(df, nan_policy, cols=[ind])
|
||||
|
||||
if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
|
||||
if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||
# Ordinal numeric data
|
||||
# Mann-Whitney test
|
||||
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
|
||||
|
||||
result_labels.append((
|
||||
'{}, median (IQR)'.format(ind),
|
||||
'{}, median (IQR)'.format(ind),
|
||||
))
|
||||
result_data.append((
|
||||
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
|
||||
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
|
||||
result
|
||||
))
|
||||
elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
|
||||
# Categorical data
|
||||
# Pearson chi-squared test
|
||||
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
|
||||
|
||||
@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
||||
result
|
||||
))
|
||||
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||
if ind in ordinal:
|
||||
# Mann-Whitney test
|
||||
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
|
||||
|
||||
result_labels.append((
|
||||
'{}, median (IQR)'.format(ind),
|
||||
'{}, median (IQR)'.format(ind),
|
||||
))
|
||||
result_data.append((
|
||||
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
|
||||
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
|
||||
result
|
||||
))
|
||||
else:
|
||||
# t test
|
||||
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
|
||||
|
||||
result_labels.append((
|
||||
'{}, μ (SD)'.format(ind),
|
||||
'{}, <i>μ</i> (SD)'.format(ind),
|
||||
))
|
||||
result_data.append((
|
||||
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
|
||||
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
|
||||
result
|
||||
))
|
||||
# Continuous data
|
||||
# t test
|
||||
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
|
||||
|
||||
result_labels.append((
|
||||
'{}, μ (SD)'.format(ind),
|
||||
'{}, <i>μ</i> (SD)'.format(ind),
|
||||
))
|
||||
result_data.append((
|
||||
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
|
||||
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
|
||||
result
|
||||
))
|
||||
else:
|
||||
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user