Implement yli.OrdinalLogit as preferred model for ordinal logistic regression

OrdinalLogit uses a parameterisation where the cutoff terms are directly incorporated
2022-12-02 20:19:08 +11:00 · 2022-12-02 20:19:08 +11:00 · 0dab62ad0a
commit 0dab62ad0a
parent f8e56d96b1
2 changed files with 41 additions and 22 deletions
--- a/yli/init.py
+++ b/yli/init.py
@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind

 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
@ -18,10 +18,10 @@ import numpy as np
 import pandas as pd
 import patsy
 from scipy import stats
-import statsmodels
+from scipy.special import expit
+import statsmodels, statsmodels.miscmodels.ordinal_model
 import statsmodels.api as sm
 from statsmodels.iolib.table import SimpleTable
-from statsmodels.miscmodels.ordinal_model import OrderedModel
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from tqdm import tqdm

@ -334,6 +334,7 @@ class RegressionResult:
 				out += '<tr><th>{}</th><td></td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term_name)
 				
 				# Render reference category
+				if term.ref_category is not None:
 					out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
 				
 				# Loop over terms
@ -401,6 +402,7 @@ class RegressionResult:
 				table_data.append([term_name + '  ', '', '', '', '', ''])
 				
 				# Render reference category
+				if term.ref_category is not None:
 					table_data.append(['{}  '.format(term.ref_category), 'Ref.', '', '', '', ''])
 				
 				# Loop over terms
@ -546,7 +548,7 @@ def regress(
 	if exp is None:
 		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
 			exp = True
-		elif model_class is OrderedModel and model_kwargs.get('distr', 'probit') == 'logit':
+		elif model_class is OrdinalLogit:
 			exp = True
 		else:
 			exp = False
@ -568,7 +570,7 @@ def regress(
 	else:
 		dmatrices = _dmatrices
 	
-	if model_class is OrderedModel:
+	if model_class is OrdinalLogit:
 		# Drop explicit intercept term
 		# FIXME: Check before dropping
 		dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:])
@ -604,9 +606,11 @@ def regress(
 			# Intercept term (single term)
 			term = '(Intercept)'
 			terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
-		elif model_class is OrderedModel and '/' in raw_name:
-			# Ignore ordinal regression intercepts
-			pass
+		elif model_class is OrdinalLogit and '/' in raw_name:
+			# Group ordinal regression cutoffs
+			if '(Cutoffs)' not in terms:
+				terms['(Cutoffs)'] = CategoricalTerm({}, None)
+			terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name])
 		else:
 			# Parse if required
 			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
@ -628,12 +632,6 @@ def regress(
 				# Single term
 				terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
 	
-	# Handle ordinal regression intercepts
-	#if model_class is OrderedModel:
-	#	intercept_names = [raw_name.split('/')[0] for raw_name in model.exog_names if '/' in raw_name]
-	#	intercepts = model.transform_threshold_params(result._results.params[-len(intercept_names):])
-	#	print(intercepts)
-	
 	# Fit null model (for llnull)
 	if hasattr(result, 'llnull'):
 		llnull = result.llnull
@ -664,10 +662,6 @@ def regress(
 	if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust':
 		full_name = 'Robust {}'.format(full_name)
 	
-	comments = []
-	if model_class is OrderedModel:
-		comments.append('Cutpoints are omitted from the table of model parameters.')
-	
 	return RegressionResult(
 		result,
 		full_name, model_class.__name__, method_name,
@ -675,7 +669,7 @@ def regress(
 		terms,
 		result.llf, llnull,
 		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
-		comments,
+		[],
 		exp
 	)

@ -816,7 +810,7 @@ def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kw

 class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 	"""
-	statsmodel-compatible model for computing Firth penalised logistic regression
+	statsmodels-compatible model for computing Firth penalised logistic regression
 	
 	Uses the R *logistf* library.
 	
@ -894,3 +888,28 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 					None  # Set exp in regress()
 				)

+# ------------------------------------------------------
+# Ordinal logistic regression (R/Stata parameterisation)
+
+class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
+	"""
+	statsmodels-compatible model for computing ordinal logistic (or probit) regression
+	
+	The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
+	The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs.
+	In this parameterisation, cutoff terms are represented directly in the model.
+	"""
+	
+	def __init__(self, endog, exog, **kwargs):
+		if 'distr' not in kwargs:
+			kwargs['distr'] = 'logit'
+		
+		super().__init__(endog, exog, **kwargs)
+	
+	def transform_threshold_params(self, params):
+		th_params = params[-(self.k_levels - 1):]
+		thresh = np.concatenate(([-np.inf], th_params, [np.inf]))
+		return thresh
+	
+	def transform_reverse_threshold_params(self, params):
+		return params[:-1]