In auto_univariable, omit NaN only from affected analyses
This commit is contained in:
parent
f80afd0e80
commit
c3eef0efea
@ -766,6 +766,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
|||||||
* For a continuous independent variable – :func:`yli.ttest_ind`
|
* For a continuous independent variable – :func:`yli.ttest_ind`
|
||||||
* For an ordinal independent variable – :func:`yli.mannwhitney`
|
* For an ordinal independent variable – :func:`yli.mannwhitney`
|
||||||
|
|
||||||
|
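If *nan_policy* is *warn* or *omit*, rows with a *nan* value in the dependent variable are omitted from all tests, whereas rows with a *nan* value in an independent variable are omitted only from the individual test of association for that variable.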
|
||||||
|
|
||||||
:param df: Data to perform the test on
|
:param df: Data to perform the test on
|
||||||
:type df: DataFrame
|
:type df: DataFrame
|
||||||
:param dep: Column in *df* for the dependent variable (dichotomous)
|
:param dep: Column in *df* for the dependent variable (dichotomous)
|
||||||
@ -780,9 +782,8 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
|||||||
:rtype: :class:`yli.sig_tests.AutoBinaryResult`
|
:rtype: :class:`yli.sig_tests.AutoBinaryResult`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs in dependent variable
|
||||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
df = check_nan(df[inds + [dep]], nan_policy, cols=[dep])
|
||||||
df = check_nan(df[inds + [dep]], nan_policy)
|
|
||||||
|
|
||||||
# Ensure 2 groups for dep
|
# Ensure 2 groups for dep
|
||||||
# TODO: Work for non-binary dependent variables?
|
# TODO: Work for non-binary dependent variables?
|
||||||
@ -792,11 +793,15 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
|||||||
result_labels = []
|
result_labels = []
|
||||||
|
|
||||||
for ind in inds:
|
for ind in inds:
|
||||||
if df[ind].dtype in ('bool', 'category', 'object'):
|
# Check for/clean NaNs in independent variable
|
||||||
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
|
df_cleaned = check_nan(df, nan_policy, cols=[ind])
|
||||||
|
|
||||||
|
if df_cleaned[ind].dtype in ('bool', 'boolean', 'category', 'object'):
|
||||||
# Pearson chi-squared test
|
# Pearson chi-squared test
|
||||||
result = chi2(df, dep, ind, nan_policy='raise')
|
result = chi2(df_cleaned, dep, ind, nan_policy='raise')
|
||||||
|
|
||||||
values = sorted(df[ind].unique())
|
values = sorted(df_cleaned[ind].unique())
|
||||||
|
|
||||||
# Value counts
|
# Value counts
|
||||||
result_labels.append((
|
result_labels.append((
|
||||||
@ -808,10 +813,10 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
|||||||
':'.join(str((data2[ind] == v).sum()) for v in values),
|
':'.join(str((data2[ind] == v).sum()) for v in values),
|
||||||
result
|
result
|
||||||
))
|
))
|
||||||
elif df[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
elif df_cleaned[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||||
if ind in ordinal:
|
if ind in ordinal:
|
||||||
# Mann-Whitney test
|
# Mann-Whitney test
|
||||||
result = mannwhitney(df, ind, dep, nan_policy='raise')
|
result = mannwhitney(df_cleaned, ind, dep, nan_policy='raise')
|
||||||
|
|
||||||
result_labels.append((
|
result_labels.append((
|
||||||
'{}, median (IQR)'.format(ind),
|
'{}, median (IQR)'.format(ind),
|
||||||
@ -824,7 +829,7 @@ def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
|||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
# t test
|
# t test
|
||||||
result = ttest_ind(df, ind, dep, nan_policy='raise')
|
result = ttest_ind(df_cleaned, ind, dep, nan_policy='raise')
|
||||||
|
|
||||||
result_labels.append((
|
result_labels.append((
|
||||||
'{}, μ (SD)'.format(ind),
|
'{}, μ (SD)'.format(ind),
|
||||||
|
Loading…
Reference in New Issue
Block a user