Initial implementation of Brant test
parent 0dab62ad0a
commit fc8303678f
yli/__init__.py

@@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
yli/regress.py | 165
@@ -31,7 +31,7 @@ import warnings
 
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .config import config
-from .sig_tests import FTestResult
+from .sig_tests import ChiSquaredResult, FTestResult
 from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
 
 def vif(df, formula=None, *, nan_policy='warn'):
@@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
 # ----------
 # Regression
 
-class LikelihoodRatioTestResult:
+class LikelihoodRatioTestResult(ChiSquaredResult):
 	"""
 	Result of a likelihood ratio test for regression
 	
@@ -102,17 +102,7 @@ class LikelihoodRatioTestResult:
 	"""
 	
 	def __init__(self, statistic, dof, pvalue):
-		#: Likelihood ratio test statistic (*float*)
-		self.statistic = statistic
-		#: Degrees of freedom for the likelihood ratio test statistic (*int*)
-		self.dof = dof
-		#: *p* value for the likelihood ratio test (*float*)
-		self.pvalue = pvalue
-
-	def __repr__(self):
-		if config.repr_is_summary:
-			return self.summary()
-		return super().__repr__()
+		super().__init__(statistic, dof, pvalue)
 	
 	def _repr_html_(self):
 		return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
@@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
 
 	def transform_reverse_threshold_params(self, params):
 		return params[:-1]
+
+class _Dummy: pass
+
+def _wald_test(param_names, params, formula, vcov, df):
+	# Hack! Piggyback off statsmodels to compute a Wald test
+
+	lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
+	lmr.model = _Dummy()
+	lmr.model.data = _Dummy()
+	lmr.model.data.cov_names = param_names
+	lmr.params = params
+	lmr.df_resid = df
+
+	return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
+
+class BrantResult:
+	# TODO: Documentation
+
+	def __init__(self, tests):
+		#: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
+		self.tests = tests
+
+	def __repr__(self):
+		if config.repr_is_summary:
+			return self.summary()
+		return super().__repr__()
+
+	def _repr_html_(self):
+		out = '<table><caption>Brant Test Results</caption><thead><tr><th></th><th style="text-align:center"><i>χ</i><sup>2</sup></th><th style="text-align:center">df</th><th style="text-align:center"><i>p</i></th></thead><tbody>'
+
+		for raw_name, test in self.tests.items():
+			out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td style="text-align:left">{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
+
+		out += '</tbody></table>'
+		return out
+
+	def summary(self):
+		"""
+		Return a stringified summary of the *χ*:sup:`2` test
+
+		:rtype: str
+		"""
+
+		# FIXME
+		return 'FIXME'
+
+def brant(
+	df, dep, formula, *,
+	nan_policy='warn',
+	fit_kwargs=None,
+	method=None, maxiter=None, start_params=None, # common fit_kwargs
+):
+	# TODO: Documentation
+
+	if fit_kwargs is None:
+		fit_kwargs = {}
+	if method is not None:
+		fit_kwargs['method'] = method
+	if maxiter is not None:
+		fit_kwargs['maxiter'] = maxiter
+	if start_params is not None:
+		fit_kwargs['start_params'] = start_params
+
+	# Check for/clean NaNs
+	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+	df = df[[dep] + cols_for_formula(formula, df)]
+	df = check_nan(df, nan_policy)
+
+	# Ensure numeric type for dependent variable
+	if df[dep].dtype != 'float64':
+		df[dep] = df[dep].astype('float64')
+
+	# Convert pandas nullable types for independent variables as this breaks statsmodels
+	df = convert_pandas_nullable(df)
+
+	# Precompute design matrix for RHS
+	# This is also X+ in Brant paper
+	dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
+	dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
+
+	# Fit individual logistic regressions
+	logit_models = []
+	for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
+		dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
+		logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
+
+		if not logit_result.mle_retvals['converged']:
+			raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
+
+		if pd.isna(logit_result.bse).any():
+			raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
+
+		logit_models.append(logit_result)
+
+	logit_betas = np.array([model._results.params for model in logit_models]).T
+	logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
+
+	# vcov is the variance-covariance matrix of all individually fitted betas across all terms
+
+	#                  | model 1         | model 2         | model 3         | ...
+	#                  | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
+	# model 1 | term 1 |
+	#         | term 2 |
+	# model 2 | term 1 |
+	#         | term 2 |
+	# ...
+
+	n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
+	n_betas = len(logit_models) * n_terms
+	vcov = np.zeros((n_betas, n_betas))
+
+	# Populate the variance-covariance matrix for comparisons between individually fitted models
+	for j in range(0, len(logit_models) - 1):
+		for l in range(j + 1, len(logit_models)):
+			Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
+			Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
+			Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
+
+			matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
+			j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
+
+			vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
+			vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
+
+	# Populate the variance-covariance matrix within each individually fitted model
+	for i in range(len(logit_models)):
+		vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
+
+	# ------------------
+	# Perform Wald tests
+
+	beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
+	wald_results = {}
+
+	# Omnibus test
+	constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
+	constraint = ', '.join(constraints)
+	df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
+	wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+	wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+	# Individual terms
+	for raw_name in dmatrix_right.columns[1:]:
+		constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
+		df = len(logit_models) - 1 # df = (number of levels minus 2)
+		wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+		wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+	return BrantResult(wald_results)
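For reference, a small sketch of how the stacked coefficient names and Wald constraint strings built by brant() above look in practice, assuming two hypothetical terms (Age, Exposure) and three fitted binary logits; these names are illustrative only and do not appear in the commit.

term_names = ['Age', 'Exposure']   # hypothetical dmatrix columns, excluding the intercept
n_models = 3                       # one binary logit per cut point of the ordinal outcome

beta_names = ['{}_{}'.format(t, i) for i in range(n_models) for t in term_names]
# -> ['Age_0', 'Exposure_0', 'Age_1', 'Exposure_1', 'Age_2', 'Exposure_2']

# Omnibus constraint: every term constrained to be equal across all binary logits
omnibus = ', '.join(' = '.join('{}_{}'.format(t, i) for i in range(n_models)) for t in term_names)
# -> 'Age_0 = Age_1 = Age_2, Exposure_0 = Exposure_1 = Exposure_2'

# Individual constraint for a single term
age_only = ' = '.join('Age_{}'.format(i) for i in range(n_models))
# -> 'Age_0 = Age_1 = Age_2'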
yli/sig_tests.py

@@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 # ------------------------
 # Pearson chi-squared test
 
-class PearsonChiSquaredResult:
+class ChiSquaredResult:
+	# TODO: Documentation
+
+	def __init__(self, statistic, dof, pvalue):
+		#: *χ*:sup:`2` statistic (*float*)
+		self.statistic = statistic
+		#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
+		self.dof = dof
+		#: *p* value for the *χ*:sup:`2` test (*float*)
+		self.pvalue = pvalue
+
+	def __repr__(self):
+		if config.repr_is_summary:
+			return self.summary()
+		return super().__repr__()
+
+	def _repr_html_(self):
+		return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
+
+	def summary(self):
+		"""
+		Return a stringified summary of the *χ*:sup:`2` test
+
+		:rtype: str
+		"""
+
+		return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
+
+class PearsonChiSquaredResult(ChiSquaredResult):
 	"""
 	Result of a Pearson *χ*:sup:`2` test
 	
@@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
 	"""
 	
 	def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
+		super().__init__(statistic, dof, pvalue)
+
 		#: Contingency table for the observations (*DataFrame*)
 		self.ct = ct
-		#: *χ*:sup:`2` statistic (*float*)
-		self.statistic = statistic
-		#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
-		self.dof = dof
-		#: *p* value for the *χ*:sup:`2` test (*float*)
-		self.pvalue = pvalue
 		#: Odds ratio (*float*; *None* if not a 2×2 table)
 		self.oddsratio = oddsratio
 		#: Risk ratio (*float*; *None* if not a 2×2 table)
 		self.riskratio = riskratio
 	
-	def __repr__(self):
-		if config.repr_is_summary:
-			return self.summary()
-		return super().__repr__()
-
 	def _repr_html_(self):
 		if self.oddsratio is not None:
 			return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
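A condensed sketch of the result-class hierarchy after this change, with method bodies and attribute docs elided: LikelihoodRatioTestResult (yli/regress.py) and PearsonChiSquaredResult (yli/sig_tests.py) now both delegate statistic, dof and pvalue handling to the new ChiSquaredResult base class, and BrantResult stores one ChiSquaredResult per tested term.

class ChiSquaredResult:                              # yli/sig_tests.py (new)
    def __init__(self, statistic, dof, pvalue):
        self.statistic = statistic
        self.dof = dof
        self.pvalue = pvalue

class LikelihoodRatioTestResult(ChiSquaredResult):   # yli/regress.py (now a subclass)
    pass                                             # overrides only the _repr_html_/summary formatting

class PearsonChiSquaredResult(ChiSquaredResult):     # yli/sig_tests.py (now a subclass)
    def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
        super().__init__(statistic, dof, pvalue)
        self.ct = ct
        self.oddsratio = oddsratio
        self.riskratio = riskratio

class BrantResult:                                   # yli/regress.py (new)
    def __init__(self, tests):
        self.tests = tests                           # dict of term name -> ChiSquaredResult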
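Below is a minimal usage sketch of the new brant() function. The simulated data, column names (Severity, Age) and random seed are assumed purely for illustration and do not appear in the commit; only the call signature and the result attributes follow the diff above.

import numpy as np
import pandas as pd
import yli

rng = np.random.default_rng(0)
n = 200
age = rng.uniform(20, 80, n)
latent = 0.05 * age + rng.logistic(size=n)
severity = np.digitize(latent, [2.0, 3.5])  # 3 ordered levels: 0, 1, 2

df = pd.DataFrame({'Severity': severity, 'Age': age})

# Fit the series of binarised logits and run the Brant Wald tests
result = yli.brant(df, 'Severity', 'Age')
print(result.tests['Omnibus'].pvalue)  # omnibus test of proportional odds
print(result.tests['Age'].pvalue)      # per-term test

Each entry in result.tests is a ChiSquaredResult, so the statistic, dof and pvalue attributes shown in the diff are available on both the omnibus and the per-term results.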