Autodetect ordinal variables in auto_univariable

2023-02-07 18:49:57 +11:00 · 2023-02-07 18:49:57 +11:00 · dbebc3b8e9
commit dbebc3b8e9
parent 68d7a31b53
2 changed files with 39 additions and 30 deletions
--- a/docs/general.rst
+++ b/docs/general.rst
@@ -14,6 +14,11 @@ Most functions take a parameter **nan_policy** to specify how to handle *nan* va

 .. autofunction:: yli.utils.check_nan

+dtype conventions
+-----------------
+
+.. autofunction:: yli.as_ordinal
+
 General result classes
 ----------------------

--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -452,6 +452,10 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 	# Ensure 2 groups for ind
 	group1, data1, group2, data2 = as_2groups(df, dep, ind)
 	
+	# Ensure numeric, factorising if required
+	data1, _ = as_numeric(data1)
+	data2, _ = as_numeric(data2)
+	
 	# Do Mann-Whitney test
 	# Stata does not perform continuity correction
 	result = stats.mannwhitneyu(data1, data2, use_continuity=use_continuity, alternative=alternative, method=method)
@@ -856,7 +860,7 @@ class AutoBinaryResult:
 		table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
 		return str(table)

-def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
+def auto_univariable(df, dep, inds, *, nan_policy='warn'):
 	"""
 	Automatically compute univariable tests of association for a dichotomous dependent variable
 	
@@ -874,8 +878,6 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 	:type dep: str
 	:param inds: Columns in *df* for the independent variables
 	:type inds: List[str]
-	:param ordinal: Columns in *df* to treat as ordinal rather than continuous
-	:type ordinal: List[str]
 	:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
 	:type nan_policy: str
 	
@@ -897,7 +899,22 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 		# Following this, we pass nan_policy='raise' to assert no NaNs remaining
 		df_cleaned = check_nan(df, nan_policy, cols=[ind])
 		
-		if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
+		if df_cleaned[ind].dtype == 'category' and df_cleaned[ind].cat.ordered and df_cleaned[ind].cat.categories.dtype in ('float64', 'int64', 'Float64', 'Int64'):
+			# Ordinal numeric data
+			# Mann-Whitney test
+			result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
+			
+			result_labels.append((
+				'{}, median (IQR)'.format(ind),
+				'{}, median (IQR)'.format(ind),
+			))
+			result_data.append((
+				'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
+				'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
+				result
+			))
+		elif df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
+			# Categorical data
 			# Pearson chi-squared test
 			result = chi2(df_cleaned, dep, ind, nan_policy='raise')
 			
@@ -914,32 +931,19 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 				result
 			))
 		elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
-			if ind in ordinal:
-				# Mann-Whitney test
-				result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
-				
-				result_labels.append((
-					'{}, median (IQR)'.format(ind),
-					'{}, median (IQR)'.format(ind),
-				))
-				result_data.append((
-					'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
-					'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
-					result
-				))
-			else:
-				# t test
-				result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
-				
-				result_labels.append((
-					'{}, μ (SD)'.format(ind),
-					'{}, <i>μ</i> (SD)'.format(ind),
-				))
-				result_data.append((
-					'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
-					'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
-					result
-				))
+			# Continuous data
+			# t test
+			result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
+			
+			result_labels.append((
+				'{}, μ (SD)'.format(ind),
+				'{}, <i>μ</i> (SD)'.format(ind),
+			))
+			result_data.append((
+				'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
+				'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
+				result
+			))
 		else:
 			raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))