From c3eef0efea7dc7661ef6d4f611191dd272ce1d9d Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Thu, 10 Nov 2022 18:47:30 +1100
Subject: [PATCH] In auto_univariable, omit NaN only from affected analyses

---
 yli/sig_tests.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index fc3404c..528988b 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -766,6 +766,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 	* For a continuous independent variable – :func:`yli.ttest_ind`
 	* For an ordinal independent variable – :func:`yli.mannwhitney`
 	
+	If *nan_policy* is *warn* or *omit*, rows with *nan* in the dependent variable are omitted from all tests, whereas rows with *nan* in an independent variable are omitted only from the test of association for that variable.
+	
 	:param df: Data to perform the test on
 	:type df: DataFrame
 	:param dep: Column in *df* for the dependent variable (dichotomous)
@@ -780,9 +782,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 	:rtype: :class:`yli.sig_tests.AutoBinaryResult`
 	"""
 	
-	# Check for/clean NaNs
-	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
-	df = check_nan(df[inds + [dep]], nan_policy)
+	# Check for/clean NaNs in dependent variable
+	df = check_nan(df[inds + [dep]], nan_policy, cols=[dep])
 	
 	# Ensure 2 groups for dep
 	# TODO: Work for non-binary dependent variables?
@@ -792,11 +793,15 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 	result_labels = []
 	
 	for ind in inds:
-		if df[ind].dtype in ('bool', 'category', 'object'):
+		# Check for/clean NaNs in independent variable
+		# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+		df_cleaned = check_nan(df, nan_policy, cols=[ind])
+		
+		if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
 			# Pearson chi-squared test
-			result = chi2(df, dep, ind, nan_policy='raise')
+			result = chi2(df_cleaned, dep, ind, nan_policy='raise')
 			
-			values = sorted(df[ind].unique())
+			values = sorted(df_cleaned[ind].unique())
 			
 			# Value counts
 			result_labels.append((
@@ -808,10 +813,10 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 				':'.join(str((data2[ind] == v).sum()) for v in values),
 				result
 			))
-		elif df[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
+		elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
 			if ind in ordinal:
 				# Mann-Whitney test
-				result = mannwhitney(df, ind, dep, nan_policy='raise')
+				result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
 				
 				result_labels.append((
 					'{}, median (IQR)'.format(ind),
@@ -824,7 +829,7 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
 				))
 			else:
 				# t test
-				result = ttest_ind(df, ind, dep, nan_policy='raise')
+				result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
 				
 				result_labels.append((
 					'{}, μ (SD)'.format(ind),