diff --git a/docs/general.rst b/docs/general.rst
index 4209d16..dc87e24 100644
--- a/docs/general.rst
+++ b/docs/general.rst
@@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va
.. autofunction:: yli.utils.check_nan
+dtype conventions
+-----------------
+
+.. autofunction:: yli.as_ordinal
+
General result classes
----------------------
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index 65b835b..6365f92 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# Ensure 2 groups for ind
group1, data1, group2, data2 = as_2groups(df, dep, ind)
+ # Ensure numeric, factorising if required
+ data1, _ = as_numeric(data1)
+ data2, _ = as_numeric(data2)
+
# Do Mann-Whitney test
# Stata does not perform continuity correction
result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
@@ -856,7 +860,7 @@ class AutoBinaryResult:
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
return str(table)
-def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
+def auto_univariable(df, dep, inds, *, nan_policy='warn'):
"""
Automatically compute univariable tests of association for a dichotomous dependent variable
@@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
:type dep: str
:param inds: Columns in *df* for the independent variables
:type inds: List[str]
- :param ordinal: Columns in *df* to treat as ordinal rather than continuous
- :type ordinal: List[str]
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
@@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df_cleaned = check_nan(df, nan_policy, cols=[ind])
- if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
+ if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
+ # Ordinal numeric data
+ # Mann-Whitney test
+ result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
+
+ result_labels.append((
+ '{}, median (IQR)'.format(ind),
+ '{}, median (IQR)'.format(ind),
+ ))
+ result_data.append((
+ '{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
+ '{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
+ result
+ ))
+ elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
+ # Categorical data
# Pearson chi-squared test
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
@@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
result
))
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
- if ind in ordinal:
- # Mann-Whitney test
- result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
-
- result_labels.append((
- '{}, median (IQR)'.format(ind),
- '{}, median (IQR)'.format(ind),
- ))
- result_data.append((
- '{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
- '{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
- result
- ))
- else:
- # t test
- result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
-
- result_labels.append((
- '{}, μ (SD)'.format(ind),
- '{}, μ (SD)'.format(ind),
- ))
- result_data.append((
- '{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
- '{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
- result
- ))
+ # Continuous data
+ # t test
+ result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
+
+ result_labels.append((
+ '{}, μ (SD)'.format(ind),
+ '{}, μ (SD)'.format(ind),
+ ))
+ result_data.append((
+ '{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
+ '{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
+ result
+ ))
else:
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))