Initial implementation of Brant test
This commit is contained in:
parent
0dab62ad0a
commit
fc8303678f
@ -19,7 +19,7 @@ from .config import config
|
|||||||
from .descriptives import auto_descriptives
|
from .descriptives import auto_descriptives
|
||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
|
from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
|
||||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
||||||
|
|
||||||
def reload_me():
|
def reload_me():
|
||||||
|
165
yli/regress.py
165
yli/regress.py
@ -31,7 +31,7 @@ import warnings
|
|||||||
|
|
||||||
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
from .bayes_factors import BayesFactor, bayesfactor_afbf
|
||||||
from .config import config
|
from .config import config
|
||||||
from .sig_tests import FTestResult
|
from .sig_tests import ChiSquaredResult, FTestResult
|
||||||
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||||
|
|
||||||
def vif(df, formula=None, *, nan_policy='warn'):
|
def vif(df, formula=None, *, nan_policy='warn'):
|
||||||
@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
|
|||||||
# ----------
|
# ----------
|
||||||
# Regression
|
# Regression
|
||||||
|
|
||||||
class LikelihoodRatioTestResult:
|
class LikelihoodRatioTestResult(ChiSquaredResult):
|
||||||
"""
|
"""
|
||||||
Result of a likelihood ratio test for regression
|
Result of a likelihood ratio test for regression
|
||||||
|
|
||||||
@ -102,17 +102,7 @@ class LikelihoodRatioTestResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, statistic, dof, pvalue):
|
def __init__(self, statistic, dof, pvalue):
|
||||||
#: Likelihood ratio test statistic (*float*)
|
super().__init__(statistic, dof, pvalue)
|
||||||
self.statistic = statistic
|
|
||||||
#: Degrees of freedom for the likelihood ratio test statistic (*int*)
|
|
||||||
self.dof = dof
|
|
||||||
#: *p* value for the likelihood ratio test (*float*)
|
|
||||||
self.pvalue = pvalue
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
if config.repr_is_summary:
|
|
||||||
return self.summary()
|
|
||||||
return super().__repr__()
|
|
||||||
|
|
||||||
def _repr_html_(self):
|
def _repr_html_(self):
|
||||||
return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
||||||
@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
|
|||||||
|
|
||||||
def transform_reverse_threshold_params(self, params):
|
def transform_reverse_threshold_params(self, params):
|
||||||
return params[:-1]
|
return params[:-1]
|
||||||
|
|
||||||
|
class _Dummy: pass
|
||||||
|
|
||||||
|
def _wald_test(param_names, params, formula, vcov, df):
|
||||||
|
# Hack! Piggyback off statsmodels to compute a Wald test
|
||||||
|
|
||||||
|
lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
|
||||||
|
lmr.model = _Dummy()
|
||||||
|
lmr.model.data = _Dummy()
|
||||||
|
lmr.model.data.cov_names = param_names
|
||||||
|
lmr.params = params
|
||||||
|
lmr.df_resid = df
|
||||||
|
|
||||||
|
return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
|
||||||
|
|
||||||
|
class BrantResult:
|
||||||
|
# TODO: Documentation
|
||||||
|
|
||||||
|
def __init__(self, tests):
|
||||||
|
#: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
|
||||||
|
self.tests = tests
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
if config.repr_is_summary:
|
||||||
|
return self.summary()
|
||||||
|
return super().__repr__()
|
||||||
|
|
||||||
|
def _repr_html_(self):
|
||||||
|
out = '<table><caption>Brant Test Results</caption><thead><tr><th></th><th style="text-align:center"><i>χ</i><sup>2</sup></th><th style="text-align:center">df</th><th style="text-align:center"><i>p</i></th></thead><tbody>'
|
||||||
|
|
||||||
|
for raw_name, test in self.tests.items():
|
||||||
|
out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td style="text-align:left">{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
|
||||||
|
|
||||||
|
out += '</tbody></table>'
|
||||||
|
return out
|
||||||
|
|
||||||
|
def summary(self):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the *χ*:sup:`2` test
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
# FIXME
|
||||||
|
return 'FIXME'
|
||||||
|
|
||||||
|
def brant(
|
||||||
|
df, dep, formula, *,
|
||||||
|
nan_policy='warn',
|
||||||
|
fit_kwargs=None,
|
||||||
|
method=None, maxiter=None, start_params=None, # common fit_kwargs
|
||||||
|
):
|
||||||
|
# TODO: Documentation
|
||||||
|
|
||||||
|
if fit_kwargs is None:
|
||||||
|
fit_kwargs = {}
|
||||||
|
if method is not None:
|
||||||
|
fit_kwargs['method'] = method
|
||||||
|
if maxiter is not None:
|
||||||
|
fit_kwargs['maxiter'] = maxiter
|
||||||
|
if start_params is not None:
|
||||||
|
fit_kwargs['start_params'] = start_params
|
||||||
|
|
||||||
|
# Check for/clean NaNs
|
||||||
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
|
df = df[[dep] + cols_for_formula(formula, df)]
|
||||||
|
df = check_nan(df, nan_policy)
|
||||||
|
|
||||||
|
# Ensure numeric type for dependent variable
|
||||||
|
if df[dep].dtype != 'float64':
|
||||||
|
df[dep] = df[dep].astype('float64')
|
||||||
|
|
||||||
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
|
df = convert_pandas_nullable(df)
|
||||||
|
|
||||||
|
# Precompute design matrix for RHS
|
||||||
|
# This is also X+ in Brant paper
|
||||||
|
dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
|
||||||
|
dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
|
||||||
|
|
||||||
|
# Fit individual logistic regressions
|
||||||
|
logit_models = []
|
||||||
|
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
|
||||||
|
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
|
||||||
|
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
|
||||||
|
|
||||||
|
if not logit_result.mle_retvals['converged']:
|
||||||
|
raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
|
||||||
|
|
||||||
|
if pd.isna(logit_result.bse).any():
|
||||||
|
raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
|
||||||
|
|
||||||
|
logit_models.append(logit_result)
|
||||||
|
|
||||||
|
logit_betas = np.array([model._results.params for model in logit_models]).T
|
||||||
|
logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
|
||||||
|
|
||||||
|
# vcov is the variance-covariance matrix of all individually fitted betas across all terms
|
||||||
|
|
||||||
|
# | model 1 | model 2 | model 3 | ...
|
||||||
|
# | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
|
||||||
|
# model 1 | term 1 |
|
||||||
|
# | term 2 |
|
||||||
|
# model 2 | term 1 |
|
||||||
|
# | term 2 |
|
||||||
|
# ...
|
||||||
|
|
||||||
|
n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
|
||||||
|
n_betas = len(logit_models) * n_terms
|
||||||
|
vcov = np.zeros((n_betas, n_betas))
|
||||||
|
|
||||||
|
# Populate the variance-covariance matrix for comparisons between individually fitted models
|
||||||
|
for j in range(0, len(logit_models) - 1):
|
||||||
|
for l in range(j + 1, len(logit_models)):
|
||||||
|
Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
|
||||||
|
Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
|
||||||
|
Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
|
||||||
|
|
||||||
|
matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
|
||||||
|
j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
|
||||||
|
|
||||||
|
vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
|
||||||
|
vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
|
||||||
|
|
||||||
|
# Populate the variance-covariance matrix within each individually fitted model
|
||||||
|
for i in range(len(logit_models)):
|
||||||
|
vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
|
||||||
|
|
||||||
|
# ------------------
|
||||||
|
# Perform Wald tests
|
||||||
|
|
||||||
|
beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
|
||||||
|
wald_results = {}
|
||||||
|
|
||||||
|
# Omnibus test
|
||||||
|
constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
|
||||||
|
constraint = ', '.join(constraints)
|
||||||
|
df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
|
||||||
|
wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
|
||||||
|
wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
|
||||||
|
|
||||||
|
# Individual terms
|
||||||
|
for raw_name in dmatrix_right.columns[1:]:
|
||||||
|
constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
|
||||||
|
df = len(logit_models) - 1 # df = (number of levels minus 2)
|
||||||
|
wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
|
||||||
|
wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
|
||||||
|
|
||||||
|
return BrantResult(wald_results)
|
||||||
|
@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
|
|||||||
# ------------------------
|
# ------------------------
|
||||||
# Pearson chi-squared test
|
# Pearson chi-squared test
|
||||||
|
|
||||||
class PearsonChiSquaredResult:
|
class ChiSquaredResult:
|
||||||
|
# TODO: Documentation
|
||||||
|
|
||||||
|
def __init__(self, statistic, dof, pvalue):
|
||||||
|
#: *χ*:sup:`2` statistic (*float*)
|
||||||
|
self.statistic = statistic
|
||||||
|
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
|
||||||
|
self.dof = dof
|
||||||
|
#: *p* value for the *χ*:sup:`2` test (*float*)
|
||||||
|
self.pvalue = pvalue
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
if config.repr_is_summary:
|
||||||
|
return self.summary()
|
||||||
|
return super().__repr__()
|
||||||
|
|
||||||
|
def _repr_html_(self):
|
||||||
|
return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
|
||||||
|
|
||||||
|
def summary(self):
|
||||||
|
"""
|
||||||
|
Return a stringified summary of the *χ*:sup:`2` test
|
||||||
|
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
|
||||||
|
return 'χ²({}) = {:.2f}; p {}'.format(self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
|
||||||
|
|
||||||
|
class PearsonChiSquaredResult(ChiSquaredResult):
|
||||||
"""
|
"""
|
||||||
Result of a Pearson *χ*:sup:`2` test
|
Result of a Pearson *χ*:sup:`2` test
|
||||||
|
|
||||||
@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
|
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
|
||||||
|
super().__init__(statistic, dof, pvalue)
|
||||||
|
|
||||||
#: Contingency table for the observations (*DataFrame*)
|
#: Contingency table for the observations (*DataFrame*)
|
||||||
self.ct = ct
|
self.ct = ct
|
||||||
#: *χ*:sup:`2` statistic (*float*)
|
|
||||||
self.statistic = statistic
|
|
||||||
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
|
|
||||||
self.dof = dof
|
|
||||||
#: *p* value for the *χ*:sup:`2` test (*float*)
|
|
||||||
self.pvalue = pvalue
|
|
||||||
#: Odds ratio (*float*; *None* if not a 2×2 table)
|
#: Odds ratio (*float*; *None* if not a 2×2 table)
|
||||||
self.oddsratio = oddsratio
|
self.oddsratio = oddsratio
|
||||||
#: Risk ratio (*float*; *None* if not a 2×2 table)
|
#: Risk ratio (*float*; *None* if not a 2×2 table)
|
||||||
self.riskratio = riskratio
|
self.riskratio = riskratio
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
if config.repr_is_summary:
|
|
||||||
return self.summary()
|
|
||||||
return super().__repr__()
|
|
||||||
|
|
||||||
def _repr_html_(self):
|
def _repr_html_(self):
|
||||||
if self.oddsratio is not None:
|
if self.oddsratio is not None:
|
||||||
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
|
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(
|
||||||
|
Loading…
Reference in New Issue
Block a user