diff --git a/yli/__init__.py b/yli/__init__.py
index fff4bac..a3c0ce3 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -19,7 +19,7 @@ from .config import config
from .descriptives import auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
def reload_me():
diff --git a/yli/regress.py b/yli/regress.py
index 7153f61..4d7edf1 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -31,7 +31,7 @@ import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
-from .sig_tests import FTestResult
+from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
def vif(df, formula=None, *, nan_policy='warn'):
@@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
# ----------
# Regression
-class LikelihoodRatioTestResult:
+class LikelihoodRatioTestResult(ChiSquaredResult):
"""
Result of a likelihood ratio test for regression
@@ -102,17 +102,7 @@ class LikelihoodRatioTestResult:
"""
def __init__(self, statistic, dof, pvalue):
- #: Likelihood ratio test statistic (*float*)
- self.statistic = statistic
- #: Degrees of freedom for the likelihood ratio test statistic (*int*)
- self.dof = dof
- #: *p* value for the likelihood ratio test (*float*)
- self.pvalue = pvalue
-
- def __repr__(self):
- if config.repr_is_summary:
- return self.summary()
- return super().__repr__()
+ super().__init__(statistic, dof, pvalue)
def _repr_html_(self):
return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
@@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
def transform_reverse_threshold_params(self, params):
return params[:-1]
+
+class _Dummy: pass
+
+def _wald_test(param_names, params, formula, vcov, df):
+ # Hack! Piggyback off statsmodels to compute a Wald test
+
+ lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
+ lmr.model = _Dummy()
+ lmr.model.data = _Dummy()
+ lmr.model.data.cov_names = param_names
+ lmr.params = params
+ lmr.df_resid = df
+
+ return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
+
+class BrantResult:
+ # TODO: Documentation
+
+ def __init__(self, tests):
+ #: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
+ self.tests = tests
+
+ def __repr__(self):
+ if config.repr_is_summary:
+ return self.summary()
+ return super().__repr__()
+
+ def _repr_html_(self):
+ out = '
Brant Test Results | χ2 | df | p |
'
+
+ for raw_name, test in self.tests.items():
+ out += '{} | {:.2f} | {:.0f} | {} |
'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
+
+ out += '
'
+ return out
+
+ def summary(self):
+ """
+ Return a stringified summary of the *χ*:sup:`2` test
+
+ :rtype: str
+ """
+
+ # FIXME
+ return 'FIXME'
+
+def brant(
+ df, dep, formula, *,
+ nan_policy='warn',
+ fit_kwargs=None,
+ method=None, maxiter=None, start_params=None, # common fit_kwargs
+):
+ # TODO: Documentation
+
+ if fit_kwargs is None:
+ fit_kwargs = {}
+ if method is not None:
+ fit_kwargs['method'] = method
+ if maxiter is not None:
+ fit_kwargs['maxiter'] = maxiter
+ if start_params is not None:
+ fit_kwargs['start_params'] = start_params
+
+ # Check for/clean NaNs
+ # Following this, we pass nan_policy='raise' to assert no NaNs remaining
+ df = df[[dep] + cols_for_formula(formula, df)]
+ df = check_nan(df, nan_policy)
+
+ # Ensure numeric type for dependent variable
+ if df[dep].dtype != 'float64':
+ df[dep] = df[dep].astype('float64')
+
+ # Convert pandas nullable types for independent variables as this breaks statsmodels
+ df = convert_pandas_nullable(df)
+
+ # Precompute design matrix for RHS
+ # This is also X+ in Brant paper
+ dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
+ dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
+
+ # Fit individual logistic regressions
+ logit_models = []
+ for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
+ dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
+ logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
+
+ if not logit_result.mle_retvals['converged']:
+ raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
+
+ if pd.isna(logit_result.bse).any():
+ raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
+
+ logit_models.append(logit_result)
+
+ logit_betas = np.array([model._results.params for model in logit_models]).T
+ logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
+
+ # vcov is the variance-covariance matrix of all individually fitted betas across all terms
+
+ # | model 1 | model 2 | model 3 | ...
+ # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
+ # model 1 | term 1 |
+ # | term 2 |
+ # model 2 | term 1 |
+ # | term 2 |
+ # ...
+
+ n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
+ n_betas = len(logit_models) * n_terms
+ vcov = np.zeros((n_betas, n_betas))
+
+ # Populate the variance-covariance matrix for comparisons between individually fitted models
+ for j in range(0, len(logit_models) - 1):
+ for l in range(j + 1, len(logit_models)):
+ Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
+ Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
+ Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
+
+ matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
+ j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
+
+ vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
+ vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
+
+ # Populate the variance-covariance matrix within each individually fitted model
+ for i in range(len(logit_models)):
+ vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
+
+ # ------------------
+ # Perform Wald tests
+
+ beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
+ wald_results = {}
+
+ # Omnibus test
+ constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
+ constraint = ', '.join(constraints)
+ df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
+ wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+ wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+ # Individual terms
+ for raw_name in dmatrix_right.columns[1:]:
+ constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
+ df = len(logit_models) - 1 # df = (number of levels minus 2)
+ wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+ wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+ return BrantResult(wald_results)
diff --git a/yli/sig_tests.py b/yli/sig_tests.py
index 364f63e..14ba7a8 100644
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# ------------------------
# Pearson chi-squared test
-class PearsonChiSquaredResult:
+class ChiSquaredResult:
+ # TODO: Documentation
+
+ def __init__(self, statistic, dof, pvalue):
+ #: *χ*:sup:`2` statistic (*float*)
+ self.statistic = statistic
+ #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
+ self.dof = dof
+ #: *p* value for the *χ*:sup:`2` test (*float*)
+ self.pvalue = pvalue
+
+ def __repr__(self):
+ if config.repr_is_summary:
+ return self.summary()
+ return super().__repr__()
+
+ def _repr_html_(self):
+ return 'χ2({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
+
+ def summary(self):
+ """
+ Return a stringified summary of the *χ*:sup:`2` test
+
+ :rtype: str
+ """
+
+ return 'χ²({}) = {:.2f}; p {}'.format(self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
+
+class PearsonChiSquaredResult(ChiSquaredResult):
"""
Result of a Pearson *χ*:sup:`2` test
@@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
"""
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
+ super().__init__(statistic, dof, pvalue)
+
#: Contingency table for the observations (*DataFrame*)
self.ct = ct
- #: *χ*:sup:`2` statistic (*float*)
- self.statistic = statistic
- #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
- self.dof = dof
- #: *p* value for the *χ*:sup:`2` test (*float*)
- self.pvalue = pvalue
#: Odds ratio (*float*; *None* if not a 2×2 table)
self.oddsratio = oddsratio
#: Risk ratio (*float*; *None* if not a 2×2 table)
self.riskratio = riskratio
- def __repr__(self):
- if config.repr_is_summary:
- return self.summary()
- return super().__repr__()
-
def _repr_html_(self):
if self.oddsratio is not None:
return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format(