diff --git a/docs/sig_tests.rst b/docs/sig_tests.rst index c5993da..8330f12 100644 --- a/docs/sig_tests.rst +++ b/docs/sig_tests.rst @@ -6,6 +6,8 @@ Functions .. autofunction:: yli.anova_oneway +.. autofunction:: yli.auto_univariable + .. autofunction:: yli.chi2 .. autofunction:: yli.mannwhitney @@ -17,6 +19,9 @@ Functions Result classes -------------- +.. autoclass:: yli.sig_tests.AutoBinaryResult + :members: + .. autoclass:: yli.sig_tests.BrunnerMunzelResult :members: diff --git a/yli/__init__.py b/yli/__init__.py index 1afea8d..89415d7 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -19,7 +19,7 @@ from .config import config from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted from .regress import PenalisedLogit, logit_then_regress, regress, vif -from .sig_tests import anova_oneway, chi2, mannwhitney, pearsonr, ttest_ind +from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind def reload_me(): import importlib diff --git a/yli/sig_tests.py b/yli/sig_tests.py index 1644b9c..b461cfc 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -98,6 +98,18 @@ class TTestResult: """ return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction) + + def summary_short(self, html): + """ + Return a stringified summary of the *t* test (*t* statistic only) + + :rtype: str + """ + + if html: + return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic) + else: + return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic) def ttest_ind(df, dep, ind, *, nan_policy='warn'): """ @@ -336,6 +348,18 @@ class MannWhitneyResult: return line1 + '\n' + self.brunnermunzel.summary() else: return line1 + + def summary_short(self, html): + """ + Return a stringified summary of the Mann–Whitney test (*U* statistic only) + + :rtype: str + """ + + if html: + return 'U = {:.1f}'.format(self.statistic) + else: + return 'U = {:.1f}'.format(self.statistic) class BrunnerMunzelResult: """ @@ -422,7 +446,7 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont # Check for/clean NaNs df = check_nan(df[[ind, dep]], nan_policy) - # Convert pandas nullable types for independent variables as this breaks statsmodels + # Convert pandas nullable types for independent variables as this breaks mannwhitneyu df = convert_pandas_nullable(df) # Ensure 2 groups for ind @@ -506,6 +530,18 @@ class PearsonChiSquaredResult: else: return '{}\n\nχ²({}) = {:.2f}; p {}'.format( self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) + + def summary_short(self, html): + """ + Return a stringified summary of the *χ*:sup:`2` test (*χ*:sup:`2` statistic only) + + :rtype: str + """ + + if html: + return 'χ2({}) = {:.2f}'.format(self.dof, self.statistic) + else: + return 'χ²({}) = {:.2f}'.format(self.dof, self.statistic) def chi2(df, dep, ind, *, nan_policy='warn'): """ @@ -662,3 +698,144 @@ def pearsonr(df, dep, ind, *, nan_policy='warn'): ci = result.confidence_interval() return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue) + +# ---------------------------- +# Automatic selection of tests + +class AutoBinaryResult: + """ + Result of automatically computed univariable tests of association for a dichotomous dependent variable + + See :func:`yli.auto_univariable`. + + Results data stored within instances of this class is not intended to be directly accessed. + """ + + def __init__(self, *, dep, group1, group2, result_data, result_labels): + #: Name of the dependent variable (*str*) + self.dep = dep + + #: Name of the first group (*str*) + self.group1 = group1 + #: Name of the second group (*str*) + self.group2 = group2 + + # List of tuples (first group summary, second group summary, test result) + self._result_data = result_data + # List of row labels for the independente variables + self._result_labels = result_labels + + def __repr__(self): + if config.repr_is_summary: + return self.summary() + return super().__repr__() + + def _repr_html_(self): + result = ''.format(self.dep, self.group1, self.group2) + + for data, label in zip(self._result_data, self._result_labels): + result += ''.format(label[1], data[0], data[1], data[2].summary_short(True), fmt_p(data[2].pvalue, PValueStyle.TABULAR | PValueStyle.HTML)) + + result += '
{}{}{}p
{}{}{}{}{}
' + return result + + def summary(self): + """ + Return a stringified summary of the tests of association + + :rtype: str + """ + + # Format data for output + result_data_fmt = [ + [r[0], r[1], r[2].summary_short(False), fmt_p(r[2].pvalue, PValueStyle.TABULAR)] + for r in self._result_data + ] + result_labels_fmt = [r[0] for r in self._result_labels] + + table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep)) + return str(table) + +def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'): + """ + Automatically compute univariable tests of association for a dichotomous dependent variable + + The tests performed are: + + * For a dichotomous independent variable – :func:`yli.chi2` + * For a continuous independent variable – :func:`yli.ttest_ind` + * For an ordinal independent variable – :func:`yli.mannwhitney` + + :param df: Data to perform the test on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (dichotomous) + :type dep: str + :param inds: Columns in *df* for the independent variables + :type inds: List[str] + :param ordinal: Columns in *df* to treat as ordinal rather than continuous + :type ordinal: List[str] + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + + :rtype: :class:`yli.sig_tests.AutoBinaryResult` + """ + + # Check for/clean NaNs + # Following this, we pass nan_policy='raise' to assert no NaNs remaining + df = check_nan(df[inds + [dep]], nan_policy) + + # Ensure 2 groups for dep + # TODO: Work for non-binary dependent variables? + group1, data1, group2, data2 = as_2groups(df, inds, dep) + + result_data = [] + result_labels = [] + + for ind in inds: + if df[ind].dtype in ('bool', 'category', 'object'): + # Pearson chi-squared test + result = chi2(df, dep, ind, nan_policy='raise') + + values = sorted(df[ind].unique()) + + # Value counts + result_labels.append(( + '{}, {}'.format(ind, ':'.join(str(v) for v in values)), + '{}, {}'.format(ind, ':'.join(str(v) for v in values)), + )) + result_data.append(( + ':'.join(str((data1[ind] == v).sum()) for v in values), + ':'.join(str((data2[ind] == v).sum()) for v in values), + result + )) + elif df[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'): + if ind in ordinal: + # Mann-Whitney test + result = mannwhitney(df, ind, dep, nan_policy='raise') + + result_labels.append(( + '{}, median (IQR)'.format(ind), + '{}, median (IQR)'.format(ind), + )) + result_data.append(( + '{:.2f} ({})'.format(result.med1, result.iqr1.summary()), + '{:.2f} ({})'.format(result.med2, result.iqr2.summary()), + result + )) + else: + # t test + result = ttest_ind(df, ind, dep, nan_policy='raise') + + result_labels.append(( + '{}, μ (SD)'.format(ind), + '{}, μ (SD)'.format(ind), + )) + result_data.append(( + '{:.2f} ({:.2f})'.format(result.mu1, result.sd1), + '{:.2f} ({:.2f})'.format(result.mu2, result.sd2), + result + )) + else: + raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype)) + + return AutoBinaryResult(dep=dep, group1=group1, group2=group2, result_data=result_data, result_labels=result_labels)