From dbebc3b8e9ecf2e699766c9137eabc3812f3ad23 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Tue, 7 Feb 2023 18:49:57 +1100 Subject: [PATCH] Autodetect ordinal variables in auto_univariable --- docs/general.rst | 5 ++++ yli/sig_tests.py | 64 +++++++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/docs/general.rst b/docs/general.rst index 4209d16..dc87e24 100644 --- a/docs/general.rst +++ b/docs/general.rst @@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va .. autofunction:: yli.utils.check_nan +dtype conventions +----------------- + +.. autofunction:: yli.as_ordinal + General result classes ---------------------- diff --git a/yli/sig_tests.py b/yli/sig_tests.py index 65b835b..6365f92 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont # Ensure 2 groups for ind group1, data1, group2, data2 = as_2groups(df, dep, ind) + # Ensure numeric, factorising if required + data1, _ = as_numeric(data1) + data2, _ = as_numeric(data2) + # Do Mann-Whitney test # Stata does not perform continuity correction result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method) @@ -856,7 +860,7 @@ class AutoBinaryResult: table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep)) return str(table) -def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): +def auto_univariable(df, dep, inds, *, nan_policy='warn'): """ Automatically compute univariable tests of association for a dichotomous dependent variable @@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): :type dep: str :param inds: Columns in *df* for the independent variables :type inds: List[str] - :param ordinal: Columns in *df* to treat as ordinal rather than continuous - :type ordinal: List[str] :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str @@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): # Following this, we pass nan_policy='raise' to assert no NaNs remaining df_cleaned = check_nan(df, nan_policy, cols=[ind]) - if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'): + if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'): + # Ordinal numeric data + # Mann-Whitney test + result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise') + + result_labels.append(( + '{}, median (IQR)'.format(ind), + '{}, median (IQR)'.format(ind), + )) + result_data.append(( + '{:.2f} ({})'.format(result.med1, result.iqr1.summary()), + '{:.2f} ({})'.format(result.med2, result.iqr2.summary()), + result + )) + elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'): + # Categorical data # Pearson chi-squared test result = chi2(df_cleaned, dep, ind, nan_policy='raise') @@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): result )) elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): - if ind in ordinal: - # Mann-Whitney test - result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise') - - result_labels.append(( - '{}, median (IQR)'.format(ind), - '{}, median (IQR)'.format(ind), - )) - result_data.append(( - '{:.2f} ({})'.format(result.med1, result.iqr1.summary()), - '{:.2f} ({})'.format(result.med2, result.iqr2.summary()), - result - )) - else: - # t test - result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise') - - result_labels.append(( - '{}, μ (SD)'.format(ind), - '{}, μ (SD)'.format(ind), - )) - result_data.append(( - '{:.2f} ({:.2f})'.format(result.mu1, result.sd1), - '{:.2f} ({:.2f})'.format(result.mu2, result.sd2), - result - )) + # Continuous data + # t test + result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise') + + result_labels.append(( + '{}, μ (SD)'.format(ind), + '{}, μ (SD)'.format(ind), + )) + result_data.append(( + '{:.2f} ({:.2f})'.format(result.mu1, result.sd1), + '{:.2f} ({:.2f})'.format(result.mu2, result.sd2), + result + )) else: raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))