Initial implementation of Brant test

RunasSudo 2022-12-02 20:20:25 +11:00
parent 0dab62ad0a
commit fc8303678f
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 184 additions and 26 deletions

View File

@@ -19,7 +19,7 @@ from .config import config
from .descriptives import auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
def reload_me():

View File

@@ -31,7 +31,7 @@ import warnings
from .bayes_factors import BayesFactor, bayesfactor_afbf
from .config import config
from .sig_tests import FTestResult
from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
def vif(df, formula=None, *, nan_policy='warn'):
@@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'):
# ----------
# Regression
class LikelihoodRatioTestResult:
class LikelihoodRatioTestResult(ChiSquaredResult):
"""
Result of a likelihood ratio test for regression
@@ -102,17 +102,7 @@ class LikelihoodRatioTestResult:
"""
def __init__(self, statistic, dof, pvalue):
#: Likelihood ratio test statistic (*float*)
self.statistic = statistic
#: Degrees of freedom for the likelihood ratio test statistic (*int*)
self.dof = dof
#: *p* value for the likelihood ratio test (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
super().__init__(statistic, dof, pvalue)
def _repr_html_(self):
return 'LR({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
@@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
def transform_reverse_threshold_params(self, params):
return params[:-1]
class _Dummy: pass
def _wald_test(param_names, params, formula, vcov, df):
# Hack! Piggyback off statsmodels to compute a Wald test
lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
lmr.model = _Dummy()
lmr.model.data = _Dummy()
lmr.model.data.cov_names = param_names
lmr.params = params
lmr.df_resid = df
return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
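# Illustrative usage of the helper above (hypothetical names):
#   _wald_test(['x_0', 'x_1'], params, 'x_0 = x_1', vcov, dof)
# tests the null hypothesis that the 2 named coefficients are equal, returning a statsmodels ContrastResults object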
class BrantResult:
"""
Result of a Brant test for a cumulative ordinal regression model
See :func:`brant`.
"""
def __init__(self, tests):
#: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
self.tests = tests
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
out = '<table><caption>Brant Test Results</caption><thead><tr><th></th><th style="text-align:center"><i>χ</i><sup>2</sup></th><th style="text-align:center">df</th><th style="text-align:center"><i>p</i></th></tr></thead><tbody>'
for raw_name, test in self.tests.items():
out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td style="text-align:left">{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
out += '</tbody></table>'
return out
def summary(self):
"""
Return a stringified summary of the Brant test
:rtype: str
"""
# Plain-text counterpart of _repr_html_ above
return '\n'.join('{}: χ²({:.0f}) = {:.2f}; p {}'.format(raw_name, test.dof, test.statistic, fmt_p(test.pvalue, PValueStyle.RELATION)) for raw_name, test in self.tests.items())
def brant(
df, dep, formula, *,
nan_policy='warn',
fit_kwargs=None,
method=None, maxiter=None, start_params=None, # common fit_kwargs
):
"""
Perform the Brant test for the proportional odds assumption of a cumulative ordinal (ordered logit) regression model
Fits a separate binary logistic regression for each cutpoint of the dependent variable, then performs Wald tests for equality of each coefficient across the fitted models (Brant 1990)
:rtype: :class:`BrantResult`
"""
if fit_kwargs is None:
fit_kwargs = {}
if method is not None:
fit_kwargs['method'] = method
if maxiter is not None:
fit_kwargs['maxiter'] = maxiter
if start_params is not None:
fit_kwargs['start_params'] = start_params
# Check for/clean NaNs
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df = df[[dep] + cols_for_formula(formula, df)]
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
# Precompute design matrix for RHS
# This is also X+ in Brant paper
dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
# Fit individual logistic regressions
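# For a dependent variable with K levels, this fits K-1 binary logits for P(dep <= upper_limit)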
logit_models = []
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
if not logit_result.mle_retvals['converged']:
raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
if pd.isna(logit_result.bse).any():
raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
logit_models.append(logit_result)
logit_betas = np.array([model._results.params for model in logit_models]).T
logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities P(dep > upper_limit), as in the Brant paper
# vcov is the variance-covariance matrix of all individually fitted betas across all terms
# | model 1 | model 2 | model 3 | ...
# | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
# model 1 | term 1 |
# | term 2 |
# model 2 | term 1 |
# | term 2 |
# ...
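# Illustrative example: with 3 fitted models and 2 non-intercept terms, vcov is 6x6,
# and the (j = 0, l = 1) block occupies rows 0:2 and columns 2:4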
n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
n_betas = len(logit_models) * n_terms
vcov = np.zeros((n_betas, n_betas))
# Populate the variance-covariance matrix for comparisons between individually fitted models
for j in range(0, len(logit_models) - 1):
for l in range(j + 1, len(logit_models)):
Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
# Populate the variance-covariance matrix within each individually fitted model
for i in range(len(logit_models)):
vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
# ------------------
# Perform Wald tests
beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
wald_results = {}
# Omnibus test
constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
constraint = ', '.join(constraints)
dof = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # dof = (number of levels minus 2) * (number of terms excluding intercept)
wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof)
wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
# Individual terms
for raw_name in dmatrix_right.columns[1:]:
constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
dof = len(logit_models) - 1 # dof = (number of levels minus 2)
wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof)
wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
return BrantResult(wald_results)
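# Minimal usage sketch (illustrative only; assumes the package is imported as yli, and
# 'Severity', 'Age', 'Group' are hypothetical columns of the DataFrame df):
#   result = yli.brant(df, 'Severity', 'Age + C(Group)')
#   result.tests['Omnibus']   # omnibus test of the proportional odds assumption
#   result.tests['Age']       # per-term test; keys follow the patsy design matrix column names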

View File

@@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont
# ------------------------
# Pearson chi-squared test
class PearsonChiSquaredResult:
class ChiSquaredResult:
"""
Result of a test whose test statistic follows the *χ*:sup:`2` distribution
"""
def __init__(self, statistic, dof, pvalue):
#: *χ*:sup:`2` statistic (*float*)
self.statistic = statistic
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
self.dof = dof
#: *p* value for the *χ*:sup:`2` test (*float*)
self.pvalue = pvalue
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
return '<i>χ</i><sup>2</sup>({}) = {:.2f}; <i>p</i> {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
def summary(self):
"""
Return a stringified summary of the *χ*:sup:`2` test
:rtype: str
"""
return 'χ²({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
class PearsonChiSquaredResult(ChiSquaredResult):
"""
Result of a Pearson *χ*:sup:`2` test
@@ -491,24 +519,15 @@ class PearsonChiSquaredResult:
"""
def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None):
super().__init__(statistic, dof, pvalue)
#: Contingency table for the observations (*DataFrame*)
self.ct = ct
#: *χ*:sup:`2` statistic (*float*)
self.statistic = statistic
#: Degrees of freedom for the *χ*:sup:`2` distribution (*int*)
self.dof = dof
#: *p* value for the *χ*:sup:`2` test (*float*)
self.pvalue = pvalue
#: Odds ratio (*float*; *None* if not a 2×2 table)
self.oddsratio = oddsratio
#: Risk ratio (*float*; *None* if not a 2×2 table)
self.riskratio = riskratio
def __repr__(self):
if config.repr_is_summary:
return self.summary()
return super().__repr__()
def _repr_html_(self):
if self.oddsratio is not None:
return '{0}<br><i>χ</i><sup>2</sup>({1}) = {2:.2f}; <i>p</i> {3}<br>OR ({4:g}% CI) = {5}<br>RR ({4:g}% CI) = {6}'.format(