From c3eef0efea7dc7661ef6d4f611191dd272ce1d9d Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Thu, 10 Nov 2022 18:47:30 +1100 Subject: [PATCH] In auto_univariable, omit NaN only from affected analyses --- yli/sig_tests.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/yli/sig_tests.py b/yli/sig_tests.py index fc3404c..528988b 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -766,6 +766,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): * For a continuous independent variable – :func:`yli.ttest_ind` * For an ordinal independent variable – :func:`yli.mannwhitney` + If *nan_policy* is *warn* or *omit*, rows with *nan* values are omitted only from the individual tests of association for the missing variables. + :param df: Data to perform the test on :type df: DataFrame :param dep: Column in *df* for the dependent variable (dichotomous) @@ -780,9 +782,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): :rtype: :class:`yli.sig_tests.AutoBinaryResult` """ - # Check for/clean NaNs - # Following this, we pass nan_policy='raise' to assert no NaNs remaining - df = check_nan(df[inds + [dep]], nan_policy) + # Check for/clean NaNs in dependent variable + df = check_nan(df[inds + [dep]], nan_policy, cols=[dep]) # Ensure 2 groups for dep # TODO: Work for non-binary dependent variables? @@ -792,11 +793,15 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): result_labels = [] for ind in inds: - if df[ind].dtype in ('bool', 'category', 'object'): + # Check for/clean NaNs in independent variable + # Following this, we pass nan_policy='raise' to assert no NaNs remaining + df_cleaned = check_nan(df, nan_policy, cols=[ind]) + + if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'): # Pearson chi-squared test - result = chi2(df, dep, ind, nan_policy='raise') + result = chi2(df_cleaned, dep, ind, nan_policy='raise') - values = sorted(df[ind].unique()) + values = sorted(df_cleaned[ind].unique()) # Value counts result_labels.append(( @@ -808,10 +813,10 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): ':'.join(str((data2[ind] == v).sum()) for v in values), result )) - elif df[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): + elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): if ind in ordinal: # Mann-Whitney test - result = mannwhitney(df, ind, dep, nan_policy='raise') + result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise') result_labels.append(( '{}, median (IQR)'.format(ind), @@ -824,7 +829,7 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): )) else: # t test - result = ttest_ind(df, ind, dep, nan_policy='raise') + result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise') result_labels.append(( '{}, μ (SD)'.format(ind),