From 461e00df786c6adde10792a7e5250c252b811ce3 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Thu, 13 Oct 2022 17:23:29 +1100 Subject: [PATCH] Implement PenalisedLogit --- tests/test_regress.py | 25 +++++++++++++++++++ yli/__init__.py | 2 +- yli/regress.py | 57 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/tests/test_regress.py b/tests/test_regress.py index 1345b1c..89ba21b 100644 --- a/tests/test_regress.py +++ b/tests/test_regress.py @@ -121,3 +121,28 @@ def test_regress_logit_ol12_23(): assert expbeta_gam.point == approx(1.169, abs=0.001) assert expbeta_gam.ci_lower == approx(0.924, abs=0.001) assert expbeta_gam.ci_upper == approx(1.477, abs=0.001) + +def test_regress_penalisedlogit_kleinman(): + """Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html""" + + df = pd.DataFrame({ + 'Pred': [1] * 20 + [0] * 220, + 'Outcome': [1] * 40 + [0] * 200 + }) + + result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False) + + assert result.dof_model == 1 + assert result.beta['(Intercept)'].point == approx(-2.280389) + assert result.beta['(Intercept)'].ci_lower == approx(-2.765427) + assert result.beta['(Intercept)'].ci_upper == approx(-1.851695) + assert result.pvalues['(Intercept)'] < 0.0001 + assert result.beta['Pred'].point == approx(5.993961) + assert result.beta['Pred'].ci_lower == approx(3.947048) + assert result.beta['Pred'].ci_upper == approx(10.852893) + assert result.pvalues['Pred'] < 0.0001 + + lrtest_result = result.lrtest_null() + assert lrtest_result.statistic == approx(78.95473) + assert lrtest_result.dof == 1 + assert lrtest_result.pvalue < 0.0001 diff --git a/yli/__init__.py b/yli/__init__.py index 8b97b9c..783c431 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -16,7 +16,7 @@ from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .fs import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import regress, vif +from .regress import PenalisedLogit, regress, vif from .sig_tests import chi2, mannwhitney, ttest_ind def reload_me(): diff --git a/yli/regress.py b/yli/regress.py index 051a6ec..5708cc7 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd import patsy from scipy import stats +import statsmodels import statsmodels.api as sm from statsmodels.iolib.table import SimpleTable from statsmodels.stats.outliers_influence import variance_inflation_factor @@ -286,7 +287,7 @@ def regress( # Autodetect whether to exponentiate if exp is None: - if model_class is sm.Logit: + if model_class is sm.Logit or model_class is PenalisedLogit: exp = True else: exp = False @@ -308,6 +309,11 @@ def regress( model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df) result = model.fit() + if isinstance(result, RegressionResult): + # Already processed! + result.exp = exp + return result + confint = result.conf_int() beta = {t: Estimate(b, confint[0][t], confint[1][t]) for t, b in result.params.items()} @@ -331,3 +337,52 @@ def regress( getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None), exp ) + +# ----------------------------- +# Penalised logistic regression + +class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): + """ + Statsmodel-compatible model for computing Firth penalised logistic regression + Uses R "logistf" library + + NB: This class expects to be used in the context of yli.regress() + """ + + def fit(self): + import rpy2.robjects as ro + import rpy2.robjects.packages + import rpy2.robjects.pandas2ri + + # Assume data is already cleaned from regress() + df = self.data.frame.copy() + + # Convert bool to int otherwise rpy2 chokes + df = df.replace({False: 0, True: 1}) + + # Import logistf + ro.packages.importr('logistf') + + with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter): + with ro.local_context() as lc: + # Convert DataFrame to R + lc['df'] = df + + # Transfer other parameters to R + lc['formula_'] = self.formula + + # Fit the model + model = ro.r('logistf(formula_, data=df)') + + beta = {t: Estimate(b, ci0, ci1) for t, b, ci0, ci1 in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'])} + pvalues = {t: p for t, p in zip(model['terms'], model['prob'])} + + return RegressionResult( + model, + 'Penalised Logistic Regression', 'Logit', 'Penalised ML', + self.endog_names, model['n'][0], model['df'][0], datetime.now(), + beta, pvalues, + model['loglik'][0], model['loglik'][1], + None, None, None, + None # Set exp in regress() + )