From 0dab62ad0a35af01f5bd46fe559dd96f678464ec Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Fri, 2 Dec 2022 20:19:08 +1100
Subject: [PATCH] Implement yli.OrdinalLogit as preferred model for ordinal
 logistic regression

OrdinalLogit uses a parameterisation where the cutoff terms are directly incorporated
---
 yli/__init__.py |  2 +-
 yli/regress.py  | 61 ++++++++++++++++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 22 deletions(-)
diff --git a/yli/__init__.py b/yli/__init__.py
index baa73ad..fff4bac 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
diff --git a/yli/regress.py b/yli/regress.py
index 1769381..7153f61 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -18,10 +18,10 @@ import numpy as np
 import pandas as pd
 import patsy
 from scipy import stats
-import statsmodels
+from scipy.special import expit
+import statsmodels, statsmodels.miscmodels.ordinal_model
 import statsmodels.api as sm
 from statsmodels.iolib.table import SimpleTable
-from statsmodels.miscmodels.ordinal_model import OrderedModel
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from tqdm import tqdm
 
@@ -334,7 +334,8 @@ class RegressionResult:
 				out += '<tr><th>{}</th><td></td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term_name)
 				
 				# Render reference category
-				out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
+				if term.ref_category is not None:
+					out += '<tr><td style="text-align:right;font-style:italic">{}</td><td>Ref.</td><td style="padding-right:0"></td><td></td><td style="padding-left:0"></td><td></td></tr>'.format(term.ref_category)
 				
 				# Loop over terms
 				for sub_term_name, sub_term in term.categories.items():
@@ -401,7 +402,8 @@ class RegressionResult:
 				table_data.append([term_name + '  ', '', '', '', '', ''])
 				
 				# Render reference category
-				table_data.append(['{}  '.format(term.ref_category), 'Ref.', '', '', '', ''])
+				if term.ref_category is not None:
+					table_data.append(['{}  '.format(term.ref_category), 'Ref.', '', '', '', ''])
 				
 				# Loop over terms
 				for sub_term_name, sub_term in term.categories.items():
@@ -546,7 +548,7 @@ def regress(
 	if exp is None:
 		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
 			exp = True
-		elif model_class is OrderedModel and model_kwargs.get('distr', 'probit') == 'logit':
+		elif model_class is OrdinalLogit:
 			exp = True
 		else:
 			exp = False
@@ -568,7 +570,7 @@ def regress(
 	else:
 		dmatrices = _dmatrices
 	
-	if model_class is OrderedModel:
+	if model_class is OrdinalLogit:
 		# Drop explicit intercept term
 		# FIXME: Check before dropping
 		dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:])
@@ -604,9 +606,11 @@ def regress(
 			# Intercept term (single term)
 			term = '(Intercept)'
 			terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
-		elif model_class is OrderedModel and '/' in raw_name:
-			# Ignore ordinal regression intercepts
-			pass
+		elif model_class is OrdinalLogit and '/' in raw_name:
+			# Group ordinal regression cutoffs
+			if '(Cutoffs)' not in terms:
+				terms['(Cutoffs)'] = CategoricalTerm({}, None)
+			terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name])
 		else:
 			# Parse if required
 			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
@@ -628,12 +632,6 @@ def regress(
 				# Single term
 				terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
 	
-	# Handle ordinal regression intercepts
-	#if model_class is OrderedModel:
-	#	intercept_names = [raw_name.split('/')[0] for raw_name in model.exog_names if '/' in raw_name]
-	#	intercepts = model.transform_threshold_params(result._results.params[-len(intercept_names):])
-	#	print(intercepts)
-	
 	# Fit null model (for llnull)
 	if hasattr(result, 'llnull'):
 		llnull = result.llnull
@@ -664,10 +662,6 @@ def regress(
 	if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust':
 		full_name = 'Robust {}'.format(full_name)
 	
-	comments = []
-	if model_class is OrderedModel:
-		comments.append('Cutpoints are omitted from the table of model parameters.')
-	
 	return RegressionResult(
 		result,
 		full_name, model_class.__name__, method_name,
@@ -675,7 +669,7 @@ def regress(
 		terms,
 		result.llf, llnull,
 		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
-		comments,
+		[],
 		exp
 	)
 
@@ -816,7 +810,7 @@ def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kw
 
 class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 	"""
-	statsmodel-compatible model for computing Firth penalised logistic regression
+	statsmodels-compatible model for computing Firth penalised logistic regression
 	
 	Uses the R *logistf* library.
 	
@@ -894,3 +888,28 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 					None  # Set exp in regress()
 				)
 
+# ------------------------------------------------------
+# Ordinal logistic regression (R/Stata parameterisation)
+
+class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
+	"""
+	statsmodels-compatible model for computing ordinal logistic (or probit) regression
+	
+	The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
+	The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs.
+	In this parameterisation, cutoff terms are represented directly in the model.
+	"""
+	
+	def __init__(self, endog, exog, **kwargs):
+		if 'distr' not in kwargs:
+			kwargs['distr'] = 'logit'
+		
+		super().__init__(endog, exog, **kwargs)
+	
+	def transform_threshold_params(self, params):
+		th_params = params[-(self.k_levels - 1):]
+		thresh = np.concatenate(([-np.inf], th_params, [np.inf]))
+		return thresh
+	
+	def transform_reverse_threshold_params(self, params):
+		return params[:-1]