Add auto_univariable
This commit is contained in:
parent
ce4df2eac1
commit
e5833796af
@ -6,6 +6,8 @@ Functions
|
|||||||
|
|
||||||
.. autofunction:: yli.anova_oneway
|
.. autofunction:: yli.anova_oneway
|
||||||
|
|
||||||
|
.. autofunction:: yli.auto_univariable
|
||||||
|
|
||||||
.. autofunction:: yli.chi2
|
.. autofunction:: yli.chi2
|
||||||
|
|
||||||
.. autofunction:: yli.mannwhitney
|
.. autofunction:: yli.mannwhitney
|
||||||
@ -17,6 +19,9 @@ Functions
|
|||||||
Result classes
|
Result classes
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
|
.. autoclass:: yli.sig_tests.AutoBinaryResult
|
||||||
|
:members:
|
||||||
|
|
||||||
.. autoclass:: yli.sig_tests.BrunnerMunzelResult
|
.. autoclass:: yli.sig_tests.BrunnerMunzelResult
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ from .config import config
|
|||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import PenalisedLogit, logit_then_regress, regress, vif
|
from .regress import PenalisedLogit, logit_then_regress, regress, vif
|
||||||
from .sig_tests import anova_oneway, chi2, mannwhitney, pearsonr, ttest_ind
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
||||||
|
|
||||||
def reload_me():
|
def reload_me():
|
||||||
import importlib
|
import importlib
|
||||||
|
179
yli/sig_tests.py
179
yli/sig_tests.py
@ -99,6 +99,18 @@ class TTestResult:
|
|||||||
|
|
||||||
return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
|
return '{}\n\nt({:.0f}) = {:.2f}; p {}\nΔμ ({:g}% CI) = {}, {}'.format(self._comparison_table(False), self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION), (1-config.alpha)*100, self.delta.summary(), self.delta_direction)
|
||||||
|
|
||||||
|
def summary_short(self, html):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the *t* test (*t* statistic only)
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
if html:
|
||||||
|
return '<i>t</i>({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
|
||||||
|
else:
|
||||||
|
return 't({:.0f}) = {:.2f}'.format(self.dof, self.statistic)
|
||||||
|
|
||||||
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
def ttest_ind(df, dep, ind, *, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
Perform an independent 2-sample Student's *t* test
|
Perform an independent 2-sample Student's *t* test
|
||||||
@ -337,6 +349,18 @@ class MannWhitneyResult:
|
|||||||
else:
|
else:
|
||||||
return line1
|
return line1
|
||||||
|
|
||||||
|
def summary_short(self, html):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the Mann–Whitney test (*U* statistic only)
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
if html:
|
||||||
|
return '<i>U</i> = {:.1f}'.format(self.statistic)
|
||||||
|
else:
|
||||||
|
return 'U = {:.1f}'.format(self.statistic)
|
||||||
|
|
||||||
class BrunnerMunzelResult:
|
class BrunnerMunzelResult:
|
||||||
"""
|
"""
|
||||||
Result of a Brunner–Munzel test
|
Result of a Brunner–Munzel test
|
||||||
@ -422,7 +446,7 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
|
|||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
df = check_nan(df[[ind, dep]], nan_policy)
|
df = check_nan(df[[ind, dep]], nan_policy)
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
# Convert pandas nullable types for independent variables as this breaks mannwhitneyu
|
||||||
df = convert_pandas_nullable(df)
|
df = convert_pandas_nullable(df)
|
||||||
|
|
||||||
# Ensure 2 groups for ind
|
# Ensure 2 groups for ind
|
||||||
@ -507,6 +531,18 @@ class PearsonChiSquaredResult:
|
|||||||
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
|
return '{}\n\nχ²({}) = {:.2f}; p {}'.format(
|
||||||
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
||||||
|
|
||||||
|
def summary_short(self, html):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the *χ*:sup:`2` test (*χ*:sup:`2` statistic only)
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
if html:
|
||||||
|
return '<i>χ</i><sup>2</sup>({}) = {:.2f}'.format(self.dof, self.statistic)
|
||||||
|
else:
|
||||||
|
return 'χ²({}) = {:.2f}'.format(self.dof, self.statistic)
|
||||||
|
|
||||||
def chi2(df, dep, ind, *, nan_policy='warn'):
|
def chi2(df, dep, ind, *, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
Perform a Pearson *χ*:sup:`2` test
|
Perform a Pearson *χ*:sup:`2` test
|
||||||
@ -662,3 +698,144 @@ def pearsonr(df, dep, ind, *, nan_policy='warn'):
|
|||||||
ci = result.confidence_interval()
|
ci = result.confidence_interval()
|
||||||
|
|
||||||
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
|
return PearsonRResult(statistic=Estimate(result.statistic, ci.low, ci.high), pvalue=result.pvalue)
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Automatic selection of tests
|
||||||
|
|
||||||
|
class AutoBinaryResult:
|
||||||
|
"""
|
||||||
|
Result of automatically computed univariable tests of association for a dichotomous dependent variable
|
||||||
|
|
||||||
|
See :func:`yli.auto_univariable`.
|
||||||
|
|
||||||
|
Results data stored within instances of this class is not intended to be directly accessed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *, dep, group1, group2, result_data, result_labels):
|
||||||
|
#: Name of the dependent variable (*str*)
|
||||||
|
self.dep = dep
|
||||||
|
|
||||||
|
#: Name of the first group (*str*)
|
||||||
|
self.group1 = group1
|
||||||
|
#: Name of the second group (*str*)
|
||||||
|
self.group2 = group2
|
||||||
|
|
||||||
|
# List of tuples (first group summary, second group summary, test result)
|
||||||
|
self._result_data = result_data
|
||||||
|
# List of row labels for the independente variables
|
||||||
|
self._result_labels = result_labels
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
if config.repr_is_summary:
|
||||||
|
return self.summary()
|
||||||
|
return super().__repr__()
|
||||||
|
|
||||||
|
def _repr_html_(self):
|
||||||
|
result = '<table><thead><tr><th>{}</th><th>{}</th><th>{}</th><th></th><th style="text-align:center"><i>p</i></th></tr></thead><tbody>'.format(self.dep, self.group1, self.group2)
|
||||||
|
|
||||||
|
for data, label in zip(self._result_data, self._result_labels):
|
||||||
|
result += '<tr><th>{}</th><td>{}</td><td>{}</td><td style="text-align:left">{}</td><td style="text-align:left">{}</td></tr>'.format(label[1], data[0], data[1], data[2].summary_short(True), fmt_p(data[2].pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
|
||||||
|
|
||||||
|
result += '</tbody></table>'
|
||||||
|
return result
|
||||||
|
|
||||||
|
def summary(self):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the tests of association
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Format data for output
|
||||||
|
result_data_fmt = [
|
||||||
|
[r[0], r[1], r[2].summary_short(False), fmt_p(r[2].pvalue, PValueStyle.TABULAR)]
|
||||||
|
for r in self._result_data
|
||||||
|
]
|
||||||
|
result_labels_fmt = [r[0] for r in self._result_labels]
|
||||||
|
|
||||||
|
table = pd.DataFrame(result_data_fmt, index=result_labels_fmt, columns=pd.Index([self.group1, self.group2, '', 'p'], name=self.dep))
|
||||||
|
return str(table)
|
||||||
|
|
||||||
|
def auto_univariable(df, dep, inds, *, ordinal=[], nan_policy='warn'):
|
||||||
|
"""
|
||||||
|
Automatically compute univariable tests of association for a dichotomous dependent variable
|
||||||
|
|
||||||
|
The tests performed are:
|
||||||
|
|
||||||
|
* For a dichotomous independent variable – :func:`yli.chi2`
|
||||||
|
* For a continuous independent variable – :func:`yli.ttest_ind`
|
||||||
|
* For an ordinal independent variable – :func:`yli.mannwhitney`
|
||||||
|
|
||||||
|
:param df: Data to perform the test on
|
||||||
|
:type df: DataFrame
|
||||||
|
:param dep: Column in *df* for the dependent variable (dichotomous)
|
||||||
|
:type dep: str
|
||||||
|
:param inds: Columns in *df* for the independent variables
|
||||||
|
:type inds: List[str]
|
||||||
|
:param ordinal: Columns in *df* to treat as ordinal rather than continuous
|
||||||
|
:type ordinal: List[str]
|
||||||
|
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
||||||
|
:type nan_policy: str
|
||||||
|
|
||||||
|
:rtype: :class:`yli.sig_tests.AutoBinaryResult`
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Check for/clean NaNs
|
||||||
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
|
df = check_nan(df[inds + [dep]], nan_policy)
|
||||||
|
|
||||||
|
# Ensure 2 groups for dep
|
||||||
|
# TODO: Work for non-binary dependent variables?
|
||||||
|
group1, data1, group2, data2 = as_2groups(df, inds, dep)
|
||||||
|
|
||||||
|
result_data = []
|
||||||
|
result_labels = []
|
||||||
|
|
||||||
|
for ind in inds:
|
||||||
|
if df[ind].dtype in ('bool', 'category', 'object'):
|
||||||
|
# Pearson chi-squared test
|
||||||
|
result = chi2(df, dep, ind, nan_policy='raise')
|
||||||
|
|
||||||
|
values = sorted(df[ind].unique())
|
||||||
|
|
||||||
|
# Value counts
|
||||||
|
result_labels.append((
|
||||||
|
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
|
||||||
|
'{}, {}'.format(ind, ':'.join(str(v) for v in values)),
|
||||||
|
))
|
||||||
|
result_data.append((
|
||||||
|
':'.join(str((data1[ind] == v).sum()) for v in values),
|
||||||
|
':'.join(str((data2[ind] == v).sum()) for v in values),
|
||||||
|
result
|
||||||
|
))
|
||||||
|
elif df[ind].dtype in ('float64', 'int64', 'Float64', 'Int64'):
|
||||||
|
if ind in ordinal:
|
||||||
|
# Mann-Whitney test
|
||||||
|
result = mannwhitney(df, ind, dep, nan_policy='raise')
|
||||||
|
|
||||||
|
result_labels.append((
|
||||||
|
'{}, median (IQR)'.format(ind),
|
||||||
|
'{}, median (IQR)'.format(ind),
|
||||||
|
))
|
||||||
|
result_data.append((
|
||||||
|
'{:.2f} ({})'.format(result.med1, result.iqr1.summary()),
|
||||||
|
'{:.2f} ({})'.format(result.med2, result.iqr2.summary()),
|
||||||
|
result
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
# t test
|
||||||
|
result = ttest_ind(df, ind, dep, nan_policy='raise')
|
||||||
|
|
||||||
|
result_labels.append((
|
||||||
|
'{}, μ (SD)'.format(ind),
|
||||||
|
'{}, <i>μ</i> (SD)'.format(ind),
|
||||||
|
))
|
||||||
|
result_data.append((
|
||||||
|
'{:.2f} ({:.2f})'.format(result.mu1, result.sd1),
|
||||||
|
'{:.2f} ({:.2f})'.format(result.mu2, result.sd2),
|
||||||
|
result
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
raise Exception('Unsupported independent dtype for auto_univariable, {}'.format(df[ind].dtype))
|
||||||
|
|
||||||
|
return AutoBinaryResult(dep=dep, group1=group1, group2=group2, result_data=result_data, result_labels=result_labels)
|
||||||
|
Loading…
Reference in New Issue
Block a user