Initial implementation of Brant test

2022-12-02 20:20:25 +11:00 · 2022-12-02 20:20:25 +11:00 · fc8303678f
commit fc8303678f
parent 0dab62ad0a
3 changed files with 184 additions and 26 deletions
--- a/yli/init.py
+++ b/yli/init.py
@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
@ -31,7 +31,7 @@ import warnings
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .config import config
-from .sig_tests import FTestResult
+from .sig_tests import ChiSquaredResult, FTestResult
 from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
 def vif(df, formula=None, *, nan_policy='warn'):
@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
 # ----------
 # Regression
-class LikelihoodRatioTestResult:
+class LikelihoodRatioTestResult(ChiSquaredResult):
 	"""
 	Result of a likelihood ratio test for regression
@ -102,17 +102,7 @@ class LikelihoodRatioTestResult:
 	"""
 	def __init__(self, statistic, dof, pvalue):
-		#: Likelihood ratio test statistic (*float*)
+		super().__init__(statistic, dof, pvalue)
 		self.statistic = statistic
 		#: Degrees of freedom for the likelihood ratio test statistic (*int*)
 		self.dof = dof
 		#: *p* value for the likelihood ratio test (*float*)
 		self.pvalue = pvalue
 	def __repr__(self):
 		if config.repr_is_summary:
 			return self.summary()
 		return super().__repr__()
 	def _repr_html_(self):
 		return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
 	def transform_reverse_threshold_params(self, params):
 		return params[:-1]
 class _Dummy: pass
 def _wald_test(param_names, params, formula, vcov, df):
 	# Hack! Piggyback off statsmodels to compute a Wald test
 	lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
 	lmr.model = _Dummy()
 	lmr.model.data = _Dummy()
 	lmr.model.data.cov_names = param_names
 	lmr.params = params
 	lmr.df_resid = df
 	return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
 class BrantResult:
 	# TODO: Documentation
 	def __init__(self, tests):
 		#: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
 		self.tests = tests
 	def __repr__(self):
 		if config.repr_is_summary:
 			return self.summary()
 		return super().__repr__()
 	def _repr_html_(self):
 		out = '<table><caption>Brant Test Results</caption><thead><tr><th></th><th style="text-align:center"><i>χ</i><sup>2</sup></th><th style="text-align:center">df</th><th style="text-align:center"><i>p</i></th></thead><tbody>'
 		for raw_name, test in self.tests.items():
 			out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td style="text-align:left">{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
 		out += '</tbody></table>'
 		return out
 	def summary(self):
 		"""
 		Return a stringified summary of the *χ*:sup:`2` test
 		:rtype: str
 		"""
 		# FIXME
 		return 'FIXME'
 def brant(
 	df, dep, formula, *,
 	nan_policy='warn',
 	fit_kwargs=None,
 	method=None, maxiter=None, start_params=None,  # common fit_kwargs
 ):
 	# TODO: Documentation
 	if fit_kwargs is None:
 		fit_kwargs = {}
 	if method is not None:
 		fit_kwargs['method'] = method
 	if maxiter is not None:
 		fit_kwargs['maxiter'] = maxiter
 	if start_params is not None:
 		fit_kwargs['start_params'] = start_params
 	# Check for/clean NaNs
 	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
 	df = df[[dep] + cols_for_formula(formula, df)]
 	df = check_nan(df, nan_policy)
 	# Ensure numeric type for dependent variable
 	if df[dep].dtype != 'float64':
 		df[dep] = df[dep].astype('float64')
 	# Convert pandas nullable types for independent variables as this breaks statsmodels
 	df = convert_pandas_nullable(df)
 	# Precompute design matrix for RHS
 	# This is also X+ in Brant paper
 	dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
 	dmatrix_right.reset_index(drop=True, inplace=True)  # Otherwise this confuses matrix multiplication
 	# Fit individual logistic regressions
 	logit_models = []
 	for upper_limit in sorted(df[dep].unique())[:-1]:  # FIXME: Sort order
 		dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
 		logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
 		if not logit_result.mle_retvals['converged']:
 			raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
 		if pd.isna(logit_result.bse).any():
 			raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
 		logit_models.append(logit_result)
 	logit_betas = np.array([model._results.params for model in logit_models]).T
 	logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T  # Predicted probabilities
 	# vcov is the variance-covariance matrix of all individually fitted betas across all terms
 	#                  |     model 1     |     model 2     |     model 3     | ...
 	#                  | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
 	# model 1 | term 1 |
 	#         | term 2 |
 	# model 2 | term 1 |
 	#         | term 2 |
 	#   ...
 	n_terms = len(dmatrix_right.columns) - 1  # number of beta terms (excluding intercept)
 	n_betas = len(logit_models) * n_terms
 	vcov = np.zeros((n_betas, n_betas))
 	# Populate the variance-covariance matrix for comparisons between individually fitted models
 	for j in range(0, len(logit_models) - 1):
 		for l in range(j + 1, len(logit_models)):
 			Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
 			Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
 			Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
 			matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
 			j_vs_l_vcov = np.asarray(matrix_result)[1:,1:]  # Asymptotic covariance for j,l
 			vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
 			vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
 	# Populate the variance-covariance matrix within each individually fitted model
 	for i in range(len(logit_models)):
 		vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
 	# ------------------
 	# Perform Wald tests
 	beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
 	wald_results = {}
 	# Omnibus test
 	constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
 	constraint = ', '.join(constraints)
 	df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1)  # df = (number of levels minus 2) * (number of terms excluding intercept)
 	wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
 	wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
 	# Individual terms
 	for raw_name in dmatrix_right.columns[1:]:
 		constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
 		df = len(logit_models) - 1  # df = (number of levels minus 2)
 		wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
 		wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
 	return BrantResult(wald_results)
--- a/yli/sig_tests.py
+++ b/yli/sig_tests.py
@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
 # ------------------------
 # Pearson chi-squared test
-class PearsonChiSquaredResult:
+class ChiSquaredResult:
 	# TODO: Documentation
 	def __init__(self, statistic, dof, pvalue):
 		#: *χ*:sup:`2` statistic (*float*)
 		self.statistic = statistic
 		#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
 		self.dof = dof
 		#: *p* value for the *χ*:sup:`2` test (*float*)
 		self.pvalue = pvalue
 	def __repr__(self):
 		if config.repr_is_summary:
 			return self.summary()
 		return super().__repr__()
 	def _repr_html_(self):
 		return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
 	def summary(self):
 		"""
 		Return a stringified summary of the *χ*:sup:`2` test
 		:rtype: str
 		"""
 		return 'χ²({}) = {:.2f}; p {}'.format(self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
 class PearsonChiSquaredResult(ChiSquaredResult):
 	"""
 	Result of a Pearson *χ*:sup:`2` test
@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
 	"""
 	def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
 		super().__init__(statistic, dof, pvalue)
 		#: Contingency table for the observations (*DataFrame*)
 		self.ct = ct
 		#: *χ*:sup:`2` statistic (*float*)
 		self.statistic = statistic
 		#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
 		self.dof = dof
 		#: *p* value for the *χ*:sup:`2` test (*float*)
 		self.pvalue = pvalue
 		#: Odds ratio (*float*; *None* if not a 2×2 table)
 		self.oddsratio = oddsratio
 		#: Risk ratio (*float*; *None* if not a 2×2 table)
 		self.riskratio = riskratio
 	def __repr__(self):
 		if config.repr_is_summary:
 			return self.summary()
 		return super().__repr__()
 	def _repr_html_(self):
 		if self.oddsratio is not None:
 			return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(