From 0dab62ad0a35af01f5bd46fe559dd96f678464ec Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Fri, 2 Dec 2022 20:19:08 +1100 Subject: [PATCH] Implement yli.OrdinalLogit as preferred model for ordinal logistic regression OrdinalLogit uses a parameterisation where the cutoff terms are directly incorporated --- yli/__init__.py | 2 +- yli/regress.py | 61 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/yli/__init__.py b/yli/__init__.py index baa73ad..fff4bac 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -19,7 +19,7 @@ from .config import config from .descriptives import auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif +from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind def reload_me(): diff --git a/yli/regress.py b/yli/regress.py index 1769381..7153f61 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -18,10 +18,10 @@ import numpy as np import pandas as pd import patsy from scipy import stats -import statsmodels +from scipy.special import expit +import statsmodels, statsmodels.miscmodels.ordinal_model import statsmodels.api as sm from statsmodels.iolib.table import SimpleTable -from statsmodels.miscmodels.ordinal_model import OrderedModel from statsmodels.stats.outliers_influence import variance_inflation_factor from tqdm import tqdm @@ -334,7 +334,8 @@ class RegressionResult: out += '{}'.format(term_name) # Render reference category - out += '{}Ref.'.format(term.ref_category) + if term.ref_category is not None: + out += '{}Ref.'.format(term.ref_category) # Loop over terms for sub_term_name, sub_term in term.categories.items(): @@ -401,7 +402,8 @@ class RegressionResult: table_data.append([term_name + ' ', '', '', '', '', '']) # Render reference category - table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', '']) + if term.ref_category is not None: + table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', '']) # Loop over terms for sub_term_name, sub_term in term.categories.items(): @@ -546,7 +548,7 @@ def regress( if exp is None: if model_class in (sm.Logit, sm.Poisson, PenalisedLogit): exp = True - elif model_class is OrderedModel and model_kwargs.get('distr', 'probit') == 'logit': + elif model_class is OrdinalLogit: exp = True else: exp = False @@ -568,7 +570,7 @@ def regress( else: dmatrices = _dmatrices - if model_class is OrderedModel: + if model_class is OrdinalLogit: # Drop explicit intercept term # FIXME: Check before dropping dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:]) @@ -604,9 +606,11 @@ def regress( # Intercept term (single term) term = '(Intercept)' terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name]) - elif model_class is OrderedModel and '/' in raw_name: - # Ignore ordinal regression intercepts - pass + elif model_class is OrdinalLogit and '/' in raw_name: + # Group ordinal regression cutoffs + if '(Cutoffs)' not in terms: + terms['(Cutoffs)'] = CategoricalTerm({}, None) + terms['(Cutoffs)'].categories[raw_name] = SingleTerm(raw_name, beta, pvalues[raw_name]) else: # Parse if required factor, column, contrast = parse_patsy_term(formula, df, raw_name) @@ -628,12 +632,6 @@ def regress( # Single term terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name]) - # Handle ordinal regression intercepts - #if model_class is OrderedModel: - # intercept_names = [raw_name.split('/')[0] for raw_name in model.exog_names if '/' in raw_name] - # intercepts = model.transform_threshold_params(result._results.params[-len(intercept_names):]) - # print(intercepts) - # Fit null model (for llnull) if hasattr(result, 'llnull'): llnull = result.llnull @@ -664,10 +662,6 @@ def regress( if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust': full_name = 'Robust {}'.format(full_name) - comments = [] - if model_class is OrderedModel: - comments.append('Cutpoints are omitted from the table of model parameters.') - return RegressionResult( result, full_name, model_class.__name__, method_name, @@ -675,7 +669,7 @@ def regress( terms, result.llf, llnull, getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None), - comments, + [], exp ) @@ -816,7 +810,7 @@ def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kw class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): """ - statsmodel-compatible model for computing Firth penalised logistic regression + statsmodels-compatible model for computing Firth penalised logistic regression Uses the R *logistf* library. @@ -894,3 +888,28 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): None # Set exp in regress() ) +# ------------------------------------------------------ +# Ordinal logistic regression (R/Stata parameterisation) + +class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel): + """ + statsmodels-compatible model for computing ordinal logistic (or probit) regression + + The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata. + The the native statsmodels implementation, the first cutoff term is the true cutoff, and further cutoff terms are log differences between consecutive cutoffs. + In this parameterisation, cutoff terms are represented directly in the model. + """ + + def __init__(self, endog, exog, **kwargs): + if 'distr' not in kwargs: + kwargs['distr'] = 'logit' + + super().__init__(endog, exog, **kwargs) + + def transform_threshold_params(self, params): + th_params = params[-(self.k_levels - 1):] + thresh = np.concatenate(([-np.inf], th_params, [np.inf])) + return thresh + + def transform_reverse_threshold_params(self, params): + return params[:-1]