Initial implementation of Brant test

RunasSudo 2022-12-02 20:20:25 +11:00
parent 0dab62ad0a
commit fc8303678f
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 184 additions and 26 deletions

View File

@@ -19,7 +19,7 @@ from .config import config
from .descriptives import auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind

def reload_me():

View File

@@ -31,7 +31,7 @@ import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
-from .sig_tests import FTestResult
+from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term

def vif(df, formula=None, *, nan_policy='warn'):
@@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
# ----------
# Regression

-class LikelihoodRatioTestResult:
+class LikelihoodRatioTestResult(ChiSquaredResult):
    """
    Result of a likelihood ratio test for regression
@@ -102,17 +102,7 @@
    """
    def __init__(self, statistic, dof, pvalue):
-        #: Likelihood ratio test statistic (*float*)
-        self.statistic = statistic
-        #: Degrees of freedom for the likelihood ratio test statistic (*int*)
-        self.dof = dof
-        #: *p* value for the likelihood ratio test (*float*)
-        self.pvalue = pvalue
-    def __repr__(self):
-        if config.repr_is_summary:
-            return self.summary()
-        return super().__repr__()
+        super().__init__(statistic, dof, pvalue)

    def _repr_html_(self):
        return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
@@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
    def transform_reverse_threshold_params(self, params):
        return params[:-1]
class _Dummy: pass

def _wald_test(param_names, params, formula, vcov, df):
    # Hack! Piggyback off statsmodels to compute a Wald test
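    # LikelihoodModelResults.wald_test() works here because it only consults .params, the parameter
    # names in .model.data.cov_names, and the covariance matrix passed as cov_p when evaluating a
    # string constraint like 'x_0 = x_1' (chi-squared statistic (Rb)' (R V R')^-1 (Rb)),
    # so dummy stand-ins for the model and its data are sufficient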
    lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
    lmr.model = _Dummy()
    lmr.model.data = _Dummy()
    lmr.model.data.cov_names = param_names
    lmr.params = params
    lmr.df_resid = df
    return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
class BrantResult:
    # TODO: Documentation
    
    def __init__(self, tests):
        #: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
        self.tests = tests
    
    def __repr__(self):
        if config.repr_is_summary:
            return self.summary()
        return super().__repr__()
    
    def _repr_html_(self):
        out = '<table><caption>Brant Test Results</caption><thead><tr><th></th><th style="text-align:center"><i>χ</i><sup>2</sup></th><th style="text-align:center">df</th><th style="text-align:center"><i>p</i></th></thead><tbody>'
        for raw_name, test in self.tests.items():
            out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td style="text-align:left">{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
        out += '</tbody></table>'
        return out

    def summary(self):
        """
        Return a stringified summary of the Brant test
        
        :rtype: str
        """
        
        # FIXME
        return 'FIXME'

def brant(
    df, dep, formula, *,
    nan_policy='warn',
    fit_kwargs=None,
    method=None, maxiter=None, start_params=None,  # common fit_kwargs
):
    # TODO: Documentation
    
    if fit_kwargs is None:
        fit_kwargs = {}
    if method is not None:
        fit_kwargs['method'] = method
    if maxiter is not None:
        fit_kwargs['maxiter'] = maxiter
    if start_params is not None:
        fit_kwargs['start_params'] = start_params

    # Check for/clean NaNs
    # Following this, we pass nan_policy='raise' to assert no NaNs remaining
    df = df[[dep] + cols_for_formula(formula, df)]
    df = check_nan(df, nan_policy)
    
    # Ensure numeric type for dependent variable
    if df[dep].dtype != 'float64':
        df[dep] = df[dep].astype('float64')
    
    # Convert pandas nullable types for independent variables, as these break statsmodels
    df = convert_pandas_nullable(df)
    
    # Precompute design matrix for RHS
    # This is also X+ in Brant paper
    dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
    dmatrix_right.reset_index(drop=True, inplace=True)  # Otherwise this confuses matrix multiplication
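    # (patsy includes an Intercept column as the first column of the design matrix; the [1:] slices
    # below exclude it so that only the substantive terms are compared across models)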

    # Fit individual logistic regressions
    logit_models = []
    for upper_limit in sorted(df[dep].unique())[:-1]:  # FIXME: Sort order
        dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
        logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
        
        if not logit_result.mle_retvals['converged']:
            raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
        if pd.isna(logit_result.bse).any():
            raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
        
        logit_models.append(logit_result)

    logit_betas = np.array([model._results.params for model in logit_models]).T
    logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T  # Predicted probabilities of exceeding each cutpoint (as in the Brant paper)
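    # (logit_betas has one row per design-matrix column, including the intercept, and one column per
    # fitted model; logit_pihat has one row per observation and one column per fitted model)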

    # vcov is the variance-covariance matrix of all individually fitted betas across all terms
    #
    #                    | model 1           | model 2           | model 3           | ...
    #                    | term 1  | term 2  | term 1  | term 2  | term 1  | term 2  | ...
    # model 1 | term 1   |
    #         | term 2   |
    # model 2 | term 1   |
    #         | term 2   |
    # ...
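    #
    # Following Brant (1990), the estimated asymptotic covariance between the coefficient vectors from
    # models j and l (j < l), excluding intercepts, is
    #     (X' W_jj X)^-1 X' W_jl X (X' W_ll X)^-1,  with W_jl = diag(pihat_l - pihat_j * pihat_l),
    # which is what matrix_result computes in the loop below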

    n_terms = len(dmatrix_right.columns) - 1  # number of beta terms (excluding intercept)
    n_betas = len(logit_models) * n_terms
    vcov = np.zeros((n_betas, n_betas))
    
    # Populate the variance-covariance matrix for comparisons between individually fitted models
    for j in range(0, len(logit_models) - 1):
        for l in range(j + 1, len(logit_models)):
            Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
            Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
            Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
            
            matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
            j_vs_l_vcov = np.asarray(matrix_result)[1:,1:]  # Asymptotic covariance for j,l
            
            vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
            vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
    
    # Populate the variance-covariance matrix within each individually fitted model
    for i in range(len(logit_models)):
        vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]

    # ------------------
    # Perform Wald tests
    
    beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
    wald_results = {}
    
    # Omnibus test
    constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
    constraint = ', '.join(constraints)
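    # (e.g. with 4 outcome levels and design-matrix columns x1 and x2, this builds the constraint
    # string 'x1_0 = x1_1 = x1_2, x2_0 = x2_1 = x2_2')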
    df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1)  # df = (number of levels minus 2) * (number of terms excluding intercept)
    wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
    wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
    
    # Individual terms
    for raw_name in dmatrix_right.columns[1:]:
        constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
        df = len(logit_models) - 1  # df = (number of levels minus 2)
        
        wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
        wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
    
    return BrantResult(wald_results)
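
A minimal usage sketch of the new function (illustrative only: hypothetical data and column names, assuming the package is imported as yli and pandas as pd):

    df = pd.DataFrame({
        'Severity': [1, 2, 3, 4, 2, 3, 1, 4, 2, 1],   # ordinal outcome
        'Exposure': [0, 1, 1, 1, 0, 0, 0, 1, 1, 0],
        'Age':      [32, 41, 56, 63, 28, 49, 37, 70, 45, 52],
    })
    result = yli.brant(df, 'Severity', 'Exposure + Age')  # formula is the patsy right-hand side
    result.tests['Omnibus'].pvalue      # p value for the omnibus test of the parallel lines assumption
    result.tests['Exposure'].statistic  # χ² statistic for an individual term

In a Jupyter notebook, displaying result renders the HTML table produced by BrantResult._repr_html_ above.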

View File

@@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# ------------------------
# Pearson chi-squared test

-class PearsonChiSquaredResult:
+class ChiSquaredResult:
    # TODO: Documentation
    
    def __init__(self, statistic, dof, pvalue):
        #: *χ*:sup:`2` statistic (*float*)
        self.statistic = statistic
        #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
        self.dof = dof
        #: *p* value for the *χ*:sup:`2` test (*float*)
        self.pvalue = pvalue
    
    def __repr__(self):
        if config.repr_is_summary:
            return self.summary()
        return super().__repr__()
    
    def _repr_html_(self):
        return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
    
    def summary(self):
        """
        Return a stringified summary of the *χ*:sup:`2` test
        
        :rtype: str
        """
        
        return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))

class PearsonChiSquaredResult(ChiSquaredResult):
""" """
Result of a Pearson *χ*:sup:`2` test Result of a Pearson *χ*:sup:`2` test
@@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
    """
    def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
        super().__init__(statistic, dof, pvalue)
        #: Contingency table for the observations (*DataFrame*)
        self.ct = ct
-        #: *χ*:sup:`2` statistic (*float*)
-        self.statistic = statistic
-        #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
-        self.dof = dof
-        #: *p* value for the *χ*:sup:`2` test (*float*)
-        self.pvalue = pvalue
        #: Odds ratio (*float*; *None* if not a 2×2 table)
        self.oddsratio = oddsratio
        #: Risk ratio (*float*; *None* if not a 2×2 table)
        self.riskratio = riskratio
-    def __repr__(self):
-        if config.repr_is_summary:
-            return self.summary()
-        return super().__repr__()
    
    def _repr_html_(self):
        if self.oddsratio is not None:
            return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(