Autodetect ordinal variables in auto_univariable

This commit is contained in:
RunasSudo 2023-02-07 18:49:57 +11:00
parent 68d7a31b53
commit dbebc3b8e9
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
2 changed files with 39 additions and 30 deletions

View File

@@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va
.. autofunction:: yli.utils.check_nan .. autofunction:: yli.utils.check_nan
dtype conventions
-----------------
.. autofunction:: yli.as_ordinal
General result classes General result classes
---------------------- ----------------------

View File

@@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# Ensure 2 groups for ind # Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind) group1, data1, group2, data2 = as_2groups(df, dep, ind)
# Ensure numeric, factorising if required
data1, _ = as_numeric(data1)
data2, _ = as_numeric(data2)
# Do Mann-Whitney test # Do Mann-Whitney test
# Stata does not perform continuity correction # Stata does not perform continuity correction
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
@@ -856,7 +860,7 @@ class AutoBinaryResult:
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep)) table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
return str(table) return str(table)
def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): def auto_univariable(df, dep, inds, *, nan_policy='warn'):
""" """
Automatically compute univariable tests of association for a dichotomous dependent variable Automatically compute univariable tests of association for a dichotomous dependent variable
@@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
:type dep: str :type dep: str
:param inds: Columns in *df* for the independent variables :param inds: Columns in *df* for the independent variables
:type inds: List[str] :type inds: List[str]
:param ordinal: Columns in *df* to treat as ordinal rather than continuous
:type ordinal: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str :type nan_policy: str
@@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
# Following this, we pass nan_policy='raise' to assert no NaNs remaining # Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan(df, nan_policy, cols=[ind]) df_cleaned = check_nan(df, nan_policy, cols=[ind])
if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'): if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
# Ordinal numeric data
# Mann-Whitney test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, median (IQR)'.format(ind),
'{}, median (IQR)'.format(ind),
))
result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
result
))
elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
# Categorical data
# Pearson chi-squared test # Pearson chi-squared test
result = chi2(df_cleaned, dep, ind, nan_policy='raise') result = chi2(df_cleaned, dep, ind, nan_policy='raise')
@@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
result result
)) ))
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
if ind in ordinal: # Continuous data
# Mann-Whitney test # t test
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise') result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append(( result_labels.append((
'{}, median (IQR)'.format(ind), '{}, μ (SD)'.format(ind),
'{}, median (IQR)'.format(ind), '{}, <i>μ</i> (SD)'.format(ind),
)) ))
result_data.append(( result_data.append((
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()), '{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()), '{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result result
)) ))
else:
# t test
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
result_labels.append((
'{}, μ (SD)'.format(ind),
'{}, <i>μ</i> (SD)'.format(ind),
))
result_data.append((
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
result
))
else: else:
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype)) raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))