Implement PenalisedLogit

2022-10-13 17:23:29 +11:00 · 2022-10-13 17:23:29 +11:00 · 461e00df78
commit 461e00df78
parent 51008296c2
3 changed files with 82 additions and 2 deletions
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@ -121,3 +121,28 @@ def test_regress_logit_ol12_23():
 	assert expbeta_gam.point == approx(1.169, abs=0.001)
 	assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
 	assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
+
+def test_regress_penalisedlogit_kleinman():
+	"""Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
+	
+	df = pd.DataFrame({
+		'Pred': [1] * 20 + [0] * 220,
+		'Outcome': [1] * 40 + [0] * 200
+	})
+	
+	result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
+	
+	assert result.dof_model == 1
+	assert result.beta['(Intercept)'].point == approx(-2.280389)
+	assert result.beta['(Intercept)'].ci_lower == approx(-2.765427)
+	assert result.beta['(Intercept)'].ci_upper == approx(-1.851695)
+	assert result.pvalues['(Intercept)'] < 0.0001
+	assert result.beta['Pred'].point == approx(5.993961)
+	assert result.beta['Pred'].ci_lower == approx(3.947048)
+	assert result.beta['Pred'].ci_upper == approx(10.852893)
+	assert result.pvalues['Pred'] < 0.0001
+	
+	lrtest_result = result.lrtest_null()
+	assert lrtest_result.statistic == approx(78.95473)
+	assert lrtest_result.dof == 1
+	assert lrtest_result.pvalue < 0.0001
--- a/yli/init.py
+++ b/yli/init.py
@ -16,7 +16,7 @@

 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import regress, vif
+from .regress import PenalisedLogit, regress, vif
 from .sig_tests import chi2, mannwhitney, ttest_ind

 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
@ -18,6 +18,7 @@ import numpy as np
 import pandas as pd
 import patsy
 from scipy import stats
+import statsmodels
 import statsmodels.api as sm
 from statsmodels.iolib.table import SimpleTable
 from statsmodels.stats.outliers_influence import variance_inflation_factor
@ -286,7 +287,7 @@ def regress(
 	
 	# Autodetect whether to exponentiate
 	if exp is None:
-		if model_class is sm.Logit:
+		if model_class is sm.Logit or model_class is PenalisedLogit:
 			exp = True
 		else:
 			exp = False
@ -308,6 +309,11 @@ def regress(
 	model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df)
 	result = model.fit()
 	
+	if isinstance(result, RegressionResult):
+		# Already processed!
+		result.exp = exp
+		return result
+	
 	confint = result.conf_int()
 	beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()}
 	
@ -331,3 +337,52 @@ def regress(
 		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
 		exp
 	)
+
+# -----------------------------
+# Penalised logistic regression
+
+class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
+	"""
+	Statsmodel-compatible model for computing Firth penalised logistic regression
+	Uses R "logistf" library
+	
+	NB: This class expects to be used in the context of yli.regress()
+	"""
+	
+	def fit(self):
+		import rpy2.robjects as ro
+		import rpy2.robjects.packages
+		import rpy2.robjects.pandas2ri
+		
+		# Assume data is already cleaned from regress()
+		df = self.data.frame.copy()
+		
+		# Convert bool to int otherwise rpy2 chokes
+		df = df.replace({False: 0, True: 1})
+		
+		# Import logistf
+		ro.packages.importr('logistf')
+		
+		with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
+			with ro.local_context() as lc:
+				# Convert DataFrame to R
+				lc['df'] = df
+				
+				# Transfer other parameters to R
+				lc['formula_'] = self.formula
+				
+				# Fit the model
+				model = ro.r('logistf(formula_, data=df)')
+				
+				beta = {t: Estimate(b, ci0, ci1) for t, b, ci0, ci1 in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'])}
+				pvalues = {t: p for t, p in zip(model['terms'], model['prob'])}
+				
+				return RegressionResult(
+					model,
+					'Penalised Logistic Regression', 'Logit', 'Penalised ML',
+					self.endog_names, model['n'][0], model['df'][0], datetime.now(),
+					beta, pvalues,
+					model['loglik'][0], model['loglik'][1],
+					None, None, None,
+					None  # Set exp in regress()
+				)