diff --git a/tests/test_anova.py b/tests/test_anova.py
index 441c59e..4d188c4 100644
--- a/tests/test_anova.py
+++ b/tests/test_anova.py
@@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
from pytest import approx
import pandas as pd
-import statsmodels.api as sm
import yli
@@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
})
- result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
+ result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
assert result.statistic == approx(545.316/18.4366, rel=0.001)
assert result.dof1 == 2
diff --git a/tests/test_bayes_factors.py b/tests/test_bayes_factors.py
index 26582d0..f7f5758 100644
--- a/tests/test_bayes_factors.py
+++ b/tests/test_bayes_factors.py
@@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
from pytest import approx
import pandas as pd
-import statsmodels.api as sm
import yli
@@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
})
- result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+ result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
diff --git a/tests/test_chi2.py b/tests/test_chi2.py
index 0685fc7..579e537 100644
--- a/tests/test_chi2.py
+++ b/tests/test_chi2.py
@@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -68,10 +68,10 @@ def test_chi2_ol10_18():
assert result.oddsratio.ci_lower == approx(1.113, abs=0.001)
assert result.oddsratio.ci_upper == approx(1.596, abs=0.001)
- expected_summary = '''Stress False True
-Response
-False 250 400
-True 750 1600
+ expected_summary = '''Stress False True
+Response
+False 250 400
+True 750 1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.11–1.60)
diff --git a/tests/test_ordinallogit.py b/tests/test_ordinallogit.py
index 67f0831..b2ab173 100644
--- a/tests/test_ordinallogit.py
+++ b/tests/test_ordinallogit.py
@@ -41,16 +41,16 @@ def test_ordinallogit_ucla():
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_lower == approx(2.72234, abs=0.001)
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
- expected_summary = ''' Ordinal Logistic Regression Results
-===============================================================
-Dep. Variable: apply | No. Observations: 400
- Model: OrdinalLogit | Df. Model: 5
- Method: Maximum Likelihood | Df. Residuals: 395
- Date: {0:%Y-%m-%d} | Pseudo R²: 0.03
- Time: {0:%H:%M:%S} | LL-Model: -358.51
- Std. Errors: Non-Robust | LL-Null: -370.60
- | p (LR): <0.001*
-===============================================================
+ expected_summary = ''' Ordinal Logistic Regression Results
+==========================================================
+Dep. Variable: apply | No. Observations: 400
+ Model: Ordinal Logit | Df. Model: 5
+ Date: {0:%Y-%m-%d} | Df. Residuals: 395
+ Time: {0:%H:%M:%S} | Pseudo R²: 0.03
+ Std. Errors: Non-Robust | LL-Model: -358.51
+ | LL-Null: -370.60
+ | p (LR): <0.001*
+============================================================
β (95% CI) p
------------------------------------------------------------
pared 1.05 (0.53 - 1.57) <0.001*
diff --git a/tests/test_regress.py b/tests/test_regress.py
index eccd8c5..a9777bf 100644
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
+# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@@ -14,11 +14,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
+import pytest
from pytest import approx
import numpy as np
import pandas as pd
-import statsmodels.api as sm
import yli
from yli.regress import CategoricalTerm
@@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
})
- result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
+ result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
assert result.dof_model == 1
assert result.dof_resid == 18
@@ -46,9 +46,27 @@ def test_regress_ols_ol11_4():
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
+
+ expected_summary = ''' Ordinary Least Squares Regression Results
+=======================================================
+Dep. Variable: GrowthRet | No. Observations: 20
+ Model: OLS | Df. Model: 1
+ Date: {0:%Y-%m-%d} | Df. Residuals: 18
+ Time: {0:%H:%M:%S} | R²: 0.74
+ Std. Errors: Non-Robust | F: 52.01
+ | p (F): <0.001*
+=======================================================
+ β (95% CI) p
+----------------------------------------------
+(Intercept) 47.48 (38.17 - 56.78) <0.001*
+ SoilPh -7.86 (-10.15 - -5.57) <0.001*
+----------------------------------------------'''.format(result.fitted_dt)
+
+ assert result.summary() == expected_summary
+@pytest.mark.skip('Not implemented in refactored regression implementation')
def test_regress_bootstrap_ols_ol11_4():
- """Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
+ """Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
df = pd.DataFrame({
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
@@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
})
df['LNC'] = np.log(df['C'])
- result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
+ result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
assert result.dof_model == 10
assert result.dof_resid == 21
@@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
})
- result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+ result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# Some numerical differences as intercept term is very negative
lrtest_result = result.lrtest_null()
@@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
======================================================
Dep. Variable: Unhealthy | No. Observations: 32
Model: Logit | Df. Model: 2
- Method: MLE | Df. Residuals: 29
- Date: {0:%Y-%m-%d} | Pseudo R²: 0.26
- Time: {0:%H:%M:%S} | LL-Model: -11.47
- Std. Errors: Non-Robust | LL-Null: -15.44
+ Date: {0:%Y-%m-%d} | Df. Residuals: 29
+ Time: {0:%H:%M:%S} | Pseudo R²: 0.26
+ Std. Errors: Non-Robust | LL-Model: -11.47
+ | LL-Null: -15.44
| p (LR): 0.02*
======================================================
exp(β) (95% CI) p
@@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
})
- result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
+ result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
assert isinstance(result.terms['Response'], CategoricalTerm)
assert result.terms['Response'].ref_category == False
@@ -217,15 +235,15 @@ def test_regress_penalisedlogit_kleinman():
assert lrtest_result.dof == 1
assert lrtest_result.pvalue < 0.0001
- expected_summary = ''' Penalised Logistic Regression Results
-=========================================================
-Dep. Variable: Outcome | No. Observations: 240
- Model: Logit | Df. Model: 1
- Method: Penalised ML | Pseudo R²: 0.37
- Date: {0:%Y-%m-%d} | LL-Model: -66.43
- Time: {0:%H:%M:%S} | LL-Null: -105.91
- Std. Errors: Non-Robust | p (LR): <0.001*
-=========================================================
+ expected_summary = ''' Penalised Logistic Regression Results
+============================================================
+Dep. Variable: Outcome | No. Observations: 240
+ Model: Penalised Logit | Df. Model: 1
+ Date: {0:%Y-%m-%d} | Pseudo R²: 0.37
+ Time: {0:%H:%M:%S} | LL-Model: -66.43
+ Std. Errors: Non-Robust | LL-Null: -105.91
+ | p (LR): <0.001*
+============================================================
β (95% CI) p
---------------------------------------------
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
diff --git a/yli/__init__.py b/yli/__init__.py
index c93f752..eedef22 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .graphs import init_fonts
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
+from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
-from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
+from .survival import kaplanmeier, logrank, turnbull
from .utils import as_ordinal
def reload_me():
diff --git a/yli/regress.py b/yli/regress.py
index 4631ea5..303babd 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -23,11 +23,12 @@ import statsmodels, statsmodels.base.model, statsmodels.miscmodels.ordinal_model
import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable
from statsmodels.stats.outliers_influence import variance_inflation_factor
-from tqdm import tqdm
from datetime import datetime
+import io
import itertools
-import warnings
+import json
+import subprocess
import weakref
from .bayes_factors import BayesFactor, bayesfactor_afbf
@@ -36,6 +37,9 @@ from .shap import ShapResult
from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
+# TODO: Documentation
+# TODO: Bootstrap
+
def vif(df, formula=None, *, nan_policy='warn'):
"""
Calculate the variance inflation factor (VIF) for each variable in *df*
@@ -93,405 +97,176 @@ def vif(df, formula=None, *, nan_policy='warn'):
return pd.Series(vifs)
-# ----------
-# Regression
+def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels=False, exp=None):
+ """
+ Fit a statsmodels regression model
+
+ :param model_class: Type of regression model to fit
+ :type model_class: :class:`RegressionModel` subclass
+ :param df: Data to perform regression on
+ :type df: DataFrame
+ :param dep: Column in *df* for the dependent variable (numeric)
+ :type dep: str
+ :param formula: Patsy formula for the regression model
+ :type formula: str
+ :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
+ :type nan_policy: str
+ :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False*
+ :type bool_baselevels: bool
+ :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class*
+ :type exp: bool
+
+ :rtype: :class:`RegressionModel`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Unhealthy': [False, False, False, ...],
+ 'Fibrinogen': [2.52, 2.46, 2.29, ...],
+ 'GammaGlobulin': [38, 36, 36, ...]
+ })
+ yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+
+ .. code-block:: text
+
+ Logistic Regression Results
+ ======================================================
+ Dep. Variable: Unhealthy | No. Observations: 32
+ Model: Logit | Df. Model: 2
+ Date: 2022-10-18 | Df. Residuals: 29
+ Time: 19:00:34 | Pseudo R²: 0.26
+ Std. Errors: Non-Robust | LL-Model: -11.47
+ | LL-Null: -15.44
+ | p (LR): 0.02*
+ ======================================================
+ exp(β) (95% CI) p
+ -----------------------------------------------
+ (Intercept) 0.00 (0.00 - 0.24) 0.03*
+ Fibrinogen 6.80 (1.01 - 45.79) 0.049*
+ GammaGlobulin 1.17 (0.92 - 1.48) 0.19
+ -----------------------------------------------
+
+ The output summarises the results of the regression.
+ Note that the parameter estimates are automatically exponentiated.
+ For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
+ """
+
+ if not any(x.__name__ == 'RegressionModel' for x in model_class.__bases__):
+ raise ValueError('model_class must be a RegressionModel')
+
+ df_ref = weakref.ref(df)
+ dmatrices, dep_categories = df_to_dmatrices(df, dep, formula, nan_policy)
+
+ # Fit model
+ result = model_class.fit(dmatrices[0], dmatrices[1])
+ result.df = df_ref
+ result.dep = dep
+ result.formula = formula
+ result.nan_policy = nan_policy
+ if exp is not None:
+ result.exp = exp
+ result.fitted_dt = datetime.now()
+ result.nobs = len(dmatrices[0])
+
+ # Process categorical terms, etc.
+ raw_terms = result.terms
+ result.terms = {}
+
+ for raw_name, raw_term in raw_terms.items():
+ # Rename terms
+ if raw_name == '(Intercept)':
+ # Already processed
+ result.terms[raw_name] = raw_term
+ elif raw_name == '(Cutoffs)':
+ result.terms[raw_name] = raw_term
+
+ if dep_categories is not None:
+ # Need to convert factorised names back into original names
+ old_cutoffs = raw_term.categories
+ raw_term.categories = {}
+
+ for cutoff_raw_name, cutoff_term in old_cutoffs.items():
+ bits = cutoff_raw_name.split('/')
+ term = dep_categories[round(float(bits[0]))] + '/' + dep_categories[round(float(bits[1]))]
+ result.terms[raw_name].categories[term] = cutoff_term
+ else:
+ # Parse if required
+ factor, column, contrast = parse_patsy_term(formula, df, raw_name)
+
+ if contrast is not None:
+ # Categorical term
+
+ if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]):
+ # Treat as single term
+ result.terms[column] = raw_term
+ else:
+ # Add a new categorical term if not exists
+ if column not in result.terms:
+ ref_category = formula_factor_ref_category(formula, df, factor)
+ result.terms[column] = CategoricalTerm({}, ref_category)
+
+ result.terms[column].categories[contrast] = raw_term
+ else:
+ # Single term
+ result.terms[column] = raw_term
+
+ return result
-class LikelihoodRatioTestResult(ChiSquaredResult):
- """
- Result of a likelihood ratio test for regression
+def df_to_dmatrices(df, dep, formula, nan_policy):
+ # Check for/clean NaNs in input columns
+ columns = [dep] + cols_for_formula(formula, df)
- See :meth:`RegressionResult.lrtest_null`.
- """
+ df = df[columns]
+ df = check_nan(df, nan_policy)
- def __init__(self, statistic, dof, pvalue):
- super().__init__(statistic, dof, pvalue)
+ # Ensure numeric type for dependent variable
+ df[dep], dep_categories = as_numeric(df[dep])
- def _repr_html_(self):
- return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
+ # Convert pandas nullable types for independent variables as this breaks statsmodels
+ df = convert_pandas_nullable(df)
- def summary(self):
- """
- Return a stringified summary of the likelihood ratio test
-
- :rtype: str
- """
-
- return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
+ # Construct design matrix from formula
+ dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
+
+ return dmatrices, dep_categories
-class RegressionResult:
- """
- Result of a regression
-
- See :func:`yli.regress`.
- """
-
- def __init__(self,
- model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs,
- raw_result,
- full_name, model_name, fit_method,
- nobs, nevents, dof_model, fitted_dt, cov_type,
- terms,
- ll_model, ll_null,
- dof_resid, rsquared, f_statistic,
- comments,
- exp
- ):
- # Data about how model fitted
- #: See :func:`yli.regress`
- self.model_class = model_class
- #: Data fitted (*weakref* to *DataFrame*)
- self.df = df
- #: See :func:`yli.regress`
- self.dep = dep
- #: See :func:`yli.regress`
- self.formula = formula
- #: See :func:`yli.regress`
- self.nan_policy = nan_policy
- #: See :func:`yli.regress`
- self.model_kwargs = model_kwargs
- #: See :func:`yli.regress`
- self.fit_kwargs = fit_kwargs
+class RegressionModel:
+ def __init__(self):
+ # Model configuration (all set in yli.regress)
+ self.df = None
+ self.dep = None
+ self.formula = None
+ self.nan_policy = None
+ self.exp = False
+ self.cov_type = None
- #: Raw result from statsmodels *model.fit*
- self.raw_result = raw_result
+ # Summary information
+ self.fitted_dt = None # Set in yli.regress
+ self.nobs = None # Set in yli.regress
+ self.nevents = None
+ self.dof_model = None
+ self.dof_resid = None
+ self.rsquared = None
+ self.ll_model = None
+ self.ll_null = None
+ self.f_statistic = None
- # Information for display
- #: Full name of the regression model type (*str*)
- self.full_name = full_name
- #: Short name of the regression model type (*str*)
- self.model_name = model_name
- #: Method for fitting the regression model (*str*)
- self.fit_method = fit_method
+ # Parameters
+ self.terms = None
+ self.vcov = None
- # Basic fitted model information
- #: Number of observations (*int*)
- self.nobs = nobs
- #: Number of events (*int*, time-to-event models only)
- self.nevents = nevents
- #: Degrees of freedom for the model (*int*)
- self.dof_model = dof_model
- #: Date and time of fitting the model (Python *datetime*)
- self.fitted_dt = fitted_dt
- #: Method for computing the covariance matrix (*str*)
- self.cov_type = cov_type
-
- # Regression coefficients/p values
- #: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`)
- self.terms = terms
-
- # Model log-likelihood
- #: Log-likelihood of fitted model (*float*)
- self.ll_model = ll_model
- #: Log-likelihood of null model (*float*)
- self.ll_null = ll_null
-
- # Extra statistics (not all regression models have these)
- #: Degrees of freedom for the residuals (*int*; *None* if N/A)
- self.dof_resid = dof_resid
- #: *R*:sup:`2` statistic (*float*; *None* if N/A)
- self.rsquared = rsquared
- #: *F* statistic (*float*; *None* if N/A)
- self.f_statistic = f_statistic
-
- #: Comments for the model (*List[str]*)
- self.comments = comments or []
-
- # Config for display style
- #: See :func:`yli.regress`
- self.exp = exp
+ # Comments
+ self.comments = []
@property
- def pseudo_rsquared(self):
- """McFadden's pseudo *R*:sup:`2` statistic"""
-
- return 1 - self.ll_model/self.ll_null
+ def model_long_name(self):
+ return None
- def lrtest_null(self):
- """
- Compute the likelihood ratio test comparing the model to the null model
-
- :rtype: :class:`LikelihoodRatioTestResult`
- """
-
- statistic = -2 * (self.ll_null - self.ll_model)
- pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
-
- return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
-
- def ftest(self):
- """
- Perform the *F* test that all slopes are 0
-
- :rtype: :class:`yli.sig_tests.FTestResult`
- """
-
- pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic)
- return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue)
-
- def bayesfactor_beta_zero(self, term):
- """
- Compute a Bayes factor testing the hypothesis that the given beta is 0
-
- Uses the R *BFpack* library.
-
- Requires the regression to be from statsmodels.
- The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`.
-
- :param term: Raw name of the term to be tested
- :type term: str
-
- :rtype: :class:`yli.bayes_factors.BayesFactor`
- """
-
- # FIXME: Allow specifying our renamed terms
-
- # Get parameters required for AFBF
- params = pd.Series({raw_name.replace('[', '_').replace(']', '_'): beta for raw_name, beta in self.raw_result.params.items()})
- cov = self.raw_result.cov_params()
-
- # Compute BF matrix
- bf01 = bayesfactor_afbf(params, cov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_')))
- bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term))
-
- if bf01.factor >= 1:
- return bf01
- else:
- return bf01.invert()
-
- def brant(self):
- """
- Perform the Brant test for the parallel regression assumption in ordinal regression
-
- Applicable when the model was fitted using :class:`OrdinalLogit`.
-
- :rtype: :class:`BrantResult`
-
- **Example:**
-
- .. code-block::
-
- df = pd.DataFrame(...)
- model = yli.regress(yli.OrdinalLogit, df, 'apply', 'pared + public + gpa', exp=False)
- model.brant()
-
- .. code-block:: text
-
- χ² df p
- Omnibus 4.34 3 0.23
- pared 0.13 1 0.72
- public 3.44 1 0.06
- gpa 0.18 1 0.67
-
- The output shows the result of the Brant test. For example, for the omnibus test of the parallel regression assumption across all independent variables, the *χ*:sup:`2` statistic is 4.34, the *χ*:sup:`2` distribution has 3 degrees of freedom, and the test is not significant, with *p* value 0.23.
-
- **Reference:** Brant R. Assessing proportionality in the proportional odds model for ordinal logistic regression. *Biometrics*. 1990;46(4):1171–8. `doi:10.2307/2532457 `_
- """
-
- df = self.df()
- if df is None:
- raise Exception('Referenced DataFrame has been dropped')
- dep = self.dep
-
- # Check for/clean NaNs
- # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
- # Following this, we pass nan_policy='raise' to assert no NaNs remaining
- df = df[[dep] + cols_for_formula(self.formula, df)]
- df = check_nan(df, 'omit')
-
- # Ensure numeric type for dependent variable
- df[dep], dep_categories = as_numeric(df[dep])
-
- # Convert pandas nullable types for independent variables as this breaks statsmodels
- df = convert_pandas_nullable(df)
-
- # Precompute design matrix for RHS
- # This is also X+ in Brant paper
- dmatrix_right = patsy.dmatrix(self.formula, df, return_type='dataframe')
- dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
-
- # Fit individual logistic regressions
- logit_models = []
- for upper_limit in sorted(df[dep].unique())[:-1]:
- dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
- logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
-
- if not logit_result.mle_retvals['converged']:
- raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
-
- if pd.isna(logit_result.bse).any():
- raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
-
- logit_models.append(logit_result)
-
- logit_betas = np.array([model._results.params for model in logit_models]).T
- logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
-
- # vcov is the variance-covariance matrix of all individually fitted betas across all terms
-
- # | model 1 | model 2 | model 3 | ...
- # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
- # model 1 | term 1 |
- # | term 2 |
- # model 2 | term 1 |
- # | term 2 |
- # ...
-
- n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
- n_betas = len(logit_models) * n_terms
- vcov = np.zeros((n_betas, n_betas))
-
- # Populate the variance-covariance matrix for comparisons between individually fitted models
- for j in range(0, len(logit_models) - 1):
- for l in range(j + 1, len(logit_models)):
- Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
- Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
- Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
-
- matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
- j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
-
- vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
- vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
-
- # Populate the variance-covariance matrix within each individually fitted model
- for i in range(len(logit_models)):
- vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
-
- # ------------------
- # Perform Wald tests
-
- beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
- wald_results = {}
-
- # Omnibus test
- constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
- constraint = ', '.join(constraints)
- df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
- wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
- wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
-
- # Individual terms
- for raw_name in dmatrix_right.columns[1:]:
- constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
- df = len(logit_models) - 1 # df = (number of levels minus 2)
- wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
- wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
-
- return BrantResult(wald_results)
-
- def bootstrap(self, samples=1000):
- """
- Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model
-
- :param samples: Number of bootstrap samples to draw
- :type samples: int
-
- :rtype: :class:`yli.regress.RegressionResult`
- """
-
- df = self.df()
- if df is None:
- raise Exception('Referenced DataFrame has been dropped')
- dep = self.dep
-
- # Check for/clean NaNs
- # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
- # Following this, we pass nan_policy='raise' to assert no NaNs remaining
- df = df[[dep] + cols_for_formula(self.formula, df)]
- df = check_nan(df, 'omit')
-
- # Ensure numeric type for dependent variable
- df[dep], dep_categories = as_numeric(df[dep])
-
- # Convert pandas nullable types for independent variables as this breaks statsmodels
- df = convert_pandas_nullable(df)
-
- # Precompute design matrices
- dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe')
-
- # Fit full model
- #full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs)
-
- # Initialise bootstrap_results
- bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
- for term in self.terms.values():
- if isinstance(term, SingleTerm):
- bootstrap_results[term.raw_name] = []
- else:
- for sub_term in term.categories.values():
- bootstrap_results[sub_term.raw_name] = []
-
- # Draw bootstrap samples and regress
- dmatrices = dmatrices[0].join(dmatrices[1])
-
- for i in tqdm(range(samples)):
- bootstrap_rows = dmatrices.sample(len(df), replace=True)
- model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs)
- model.formula = dep + ' ~ ' + self.formula
-
- result = model.fit(disp=False, **self.fit_kwargs)
-
- for raw_name, raw_beta in zip(model.exog_names, result._results.params):
- bootstrap_results[raw_name].append(raw_beta)
-
- # Combine bootstrap results
- terms = {}
- for term_name, term in self.terms.items():
- if isinstance(term, SingleTerm):
- bootstrap_betas = bootstrap_results[term.raw_name]
- bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
- bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
- terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
- else:
- categories = {}
- for sub_term_name, sub_term in term.categories.items():
- bootstrap_betas = bootstrap_results[sub_term.raw_name]
- bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
- bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
- categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
- terms[term_name] = CategoricalTerm(categories, term.ref_category)
-
- return RegressionResult(
- self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs,
- None,
- self.full_name, self.model_name, self.fit_method,
- self.nobs, None, self.dof_model, datetime.now(), 'Bootstrap',
- terms,
- self.ll_model, self.ll_null,
- self.dof_resid, self.rsquared, self.f_statistic,
- self.comments,
- self.exp
- )
-
- def shap(self, **kwargs):
- """
- Compute SHAP values for the model
-
- Uses the Python *shap* library.
-
- :param kwargs: Keyword arguments to pass to *shap.LinearExplainer*
-
- :rtype: :class:`yli.shap.ShapResult`
-
- **Reference:** Lundberg SM, Lee SI. A unified approach to interpreting model predictions. In: Guyon I, Von Luxburg U, Bengio S, et al., editors. *Advances in Neural Information Processing Systems*; 2017 Dec 4–9; Long Beach, CA. https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html
- """
-
- import shap
-
- xdata = ShapResult._get_xdata(self)
-
- # Combine terms into single list
- params = []
- for term in self.terms.values():
- if isinstance(term, SingleTerm):
- params.append(term.beta.point)
- else:
- params.extend(s.beta.point for s in term.categories.values())
-
- explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept
- shap_values = explainer.shap_values(xdata).astype('float')
-
- return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
+ @property
+ def model_short_name(self):
+ return None
def _header_table(self, html):
"""Return the entries for the header table"""
@@ -500,8 +275,7 @@ class RegressionResult:
left_col = []
left_col.append(('Dep. Variable:', self.dep))
- left_col.append(('Model:', self.model_name))
- left_col.append(('Method:', self.fit_method))
+ left_col.append(('Model:', self.model_short_name))
left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
if self.cov_type:
@@ -555,7 +329,7 @@ class RegressionResult:
# Render header table
left_col, right_col = self._header_table(html=True)
- out = '{} Results'.format(self.full_name)
+ out = '{} Results'.format(self.model_long_name)
for left_cell, right_cell in itertools.zip_longest(left_col, right_col):
out += '{} | {} | {} | {} |
'.format(
left_cell[0] if left_cell else '',
@@ -625,7 +399,7 @@ class RegressionResult:
elif len(left_col) > len(right_col):
right_col.extend([('', '')] * (len(left_col) - len(right_col)))
- table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.full_name))
+ table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.model_long_name))
table1.insert_stubs(2, [' | '] * len(left_col))
# Get rid of last line (merge with next table)
@@ -681,12 +455,130 @@ class RegressionResult:
out += '\n{}. {}'.format(i + 1, comment)
return out
+
+ # --------------------
+ # Post hoc tests, etc.
+
+ def bayesfactor_beta_zero(self, term):
+ """
+ Compute a Bayes factor testing the hypothesis that the given beta is 0
+
+ Uses the R *BFpack* library.
+
+ Requires the regression to be from statsmodels.
+ The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`.
+
+ :param term: Raw name of the term to be tested
+ :type term: str
+
+ :rtype: :class:`yli.bayes_factors.BayesFactor`
+ """
+
+ # FIXME: Allow specifying our renamed terms
+
+ # Get parameters required for AFBF
+ raw_params = {}
+ for t in self.terms.values():
+ if isinstance(t, CategoricalTerm):
+ for t in t.categories.values():
+ raw_params[t.raw_name.replace('[', '_').replace(']', '_')] = t.beta.point
+ else:
+ raw_params[t.raw_name.replace('[', '_').replace(']', '_')] = t.beta.point
+
+ # Compute BF matrix
+ bf01 = bayesfactor_afbf(pd.Series(raw_params), self.vcov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_')))
+ bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term))
+
+ if bf01.factor >= 1:
+ return bf01
+ else:
+ return bf01.invert()
+
+ def ftest(self):
+ """
+ Perform the *F* test that all slopes are 0
+
+ :rtype: :class:`yli.sig_tests.FTestResult`
+ """
+
+ pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic)
+ return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue)
+
+ def lrtest_null(self):
+ """
+ Compute the likelihood ratio test comparing the model to the null model
+
+ :rtype: :class:`LikelihoodRatioTestResult`
+ """
+
+ statistic = -2 * (self.ll_null - self.ll_model)
+ pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model)
+ LikelihoodRatioTestResult
+ return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue)
+
+ @property
+ def pseudo_rsquared(self):
+ """McFadden's pseudo *R*:sup:`2` statistic"""
+
+ return 1 - self.ll_model/self.ll_null
+
+ def shap(self, **kwargs):
+ """
+ Compute SHAP values for the model
+
+ Uses the Python *shap* library.
+
+ :param kwargs: Keyword arguments to pass to *shap.LinearExplainer*
+
+ :rtype: :class:`yli.shap.ShapResult`
+
+ **Reference:** Lundberg SM, Lee SI. A unified approach to interpreting model predictions. In: Guyon I, Von Luxburg U, Bengio S, et al., editors. *Advances in Neural Information Processing Systems*; 2017 Dec 4–9; Long Beach, CA. https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html
+ """
+
+ import shap
+
+ xdata = ShapResult._get_xdata(self)
+
+ # Combine terms into single list
+ params = []
+ for term in self.terms.values():
+ if isinstance(term, SingleTerm):
+ params.append(term.beta.point)
+ else:
+ params.extend(s.beta.point for s in term.categories.values())
+
+ explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept
+ shap_values = explainer.shap_values(xdata).astype('float')
+
+ return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
+
+class LikelihoodRatioTestResult(ChiSquaredResult):
+ """
+ Result of a likelihood ratio test for regression
+
+ See :meth:`RegressionResult.lrtest_null`.
+ """
+
+ def __init__(self, statistic, dof, pvalue):
+ super().__init__(statistic, dof, pvalue)
+
+ def _repr_html_(self):
+ return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML))
+
+ def summary(self):
+ """
+ Return a stringified summary of the likelihood ratio test
+
+ :rtype: str
+ """
+
+ return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION))
class SingleTerm:
- """A term in a :class:`RegressionResult` which is a single term"""
+ """A term in a :class:`RegressionModel` which is a single term"""
def __init__(self, raw_name, beta, pvalue):
- #: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`)
+ #: Raw name of the term (*str*; e.g. in :attr:`RegressionModel.raw_result`)
self.raw_name = raw_name
#: :class:`yli.utils.Estimate` of the coefficient
self.beta = beta
@@ -694,7 +586,7 @@ class SingleTerm:
self.pvalue = pvalue
class CategoricalTerm:
- """A group of terms in a :class:`RegressionResult` corresponding to a categorical variable"""
+ """A group of terms in a :class:`RegressionModel` corresponding to a categorical variable"""
def __init__(self, categories, ref_category):
#: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`)
@@ -702,463 +594,80 @@ class CategoricalTerm:
#: Name of the reference category (*str*)
self.ref_category = ref_category
-def regress(
- model_class, df, dep, formula, *,
- nan_policy='warn',
- model_kwargs=None, fit_kwargs=None,
- family=None, exposure=None, status=None, # common model_kwargs
- cov_type=None, method=None, maxiter=None, start_params=None, # common fit_kwargs
- bool_baselevels=False, exp=None,
- _dmatrices=None,
-):
- """
- Fit a statsmodels regression model
+def raw_terms_from_statsmodels_result(raw_result):
+ return {
+ ('(Intercept)' if raw_name == 'Intercept' else raw_name): SingleTerm(
+ raw_name=raw_name,
+ beta=Estimate(raw_param, raw_ci[0], raw_ci[1]),
+ pvalue=raw_p
+ )
+ for raw_name, raw_param, raw_ci, raw_p in zip(raw_result.model.exog_names, raw_result.params.values, raw_result.conf_int(config.alpha).values, raw_result.pvalues.values)
+ }
+
+# ------------------------
+# Concrete implementations
+
+class Logit(RegressionModel):
+ @property
+ def model_long_name(self):
+ return 'Logistic Regression'
- :param model_class: Type of regression model to fit
- :type model_class: statsmodels model class
- :param df: Data to perform regression on
- :type df: DataFrame
- :param dep: Column in *df* for the dependent variable (numeric)
- :type dep: str
- :param formula: Patsy formula for the regression model
- :type formula: str
- :param exposure: Column in *df* for the exposure variable (numeric, some models only)
- :type exposure: str
- :param status: Column in *df* for the status variable (True/False or 1/0, time-to-event models only)
- :type status: str
- :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
- :type nan_policy: str
- :param model_kwargs: Keyword arguments to pass to *model_class* constructor
- :type model_kwargs: dict
- :param fit_kwargs: Keyword arguments to pass to *model.fit*
- :type fit_kwargs: dict
- :param family: See statsmodels *GLM* constructor
- :param cov_type: See statsmodels *model.fit*
- :param method: See statsmodels *model.fit*
- :param maxiter: See statsmodels *model.fit*
- :param start_params: See statsmodels *model.fit*
- :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False*
- :type bool_baselevels: bool
- :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class*
- :type exp: bool
+ @property
+ def model_short_name(self):
+ return 'Logit'
- :rtype: :class:`yli.regress.RegressionResult`
-
- **Example:**
-
- .. code-block::
+ @classmethod
+ def fit(cls, data_dep, data_ind):
+ result = cls()
+ result.exp = True
+ result.cov_type = 'nonrobust'
- df = pd.DataFrame({
- 'Unhealthy': [False, False, False, ...],
- 'Fibrinogen': [2.52, 2.46, 2.29, ...],
- 'GammaGlobulin': [38, 36, 36, ...]
- })
- yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
-
- .. code-block:: text
+ # Perform regression
+ raw_result = sm.Logit(endog=data_dep, exog=data_ind, missing='raise').fit(disp=False)
- Logistic Regression Results
- ======================================================
- Dep. Variable: Unhealthy | No. Observations: 32
- Model: Logit | Df. Model: 2
- Method: MLE | Df. Residuals: 29
- Date: 2022-10-18 | Pseudo R²: 0.26
- Time: 19:00:34 | LL-Model: -11.47
- Std. Errors: Non-Robust | LL-Null: -15.44
- | p (LR): 0.02*
- ======================================================
- exp(β) (95% CI) p
- -----------------------------------------------
- (Intercept) 0.00 (0.00 - 0.24) 0.03*
- Fibrinogen 6.80 (1.01 - 45.79) 0.049*
- GammaGlobulin 1.17 (0.92 - 1.48) 0.19
- -----------------------------------------------
-
- The output summarises the results of the regression.
- Note that the parameter estimates are automatically exponentiated.
- For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049.
- """
-
- # Populate model_kwargs
- if model_kwargs is None:
- model_kwargs = {}
- if family is not None:
- model_kwargs['family'] = family
-
- # Populate fit_kwargs
- if fit_kwargs is None:
- fit_kwargs = {}
- if cov_type is not None:
- fit_kwargs['cov_type'] = cov_type
- if method is not None:
- fit_kwargs['method'] = method
- if maxiter is not None:
- fit_kwargs['maxiter'] = maxiter
- if start_params is not None:
- fit_kwargs['start_params'] = start_params
-
- # Autodetect whether to exponentiate
- if exp is None:
- if model_class in (sm.Logit, sm.PHReg, sm.Poisson, OrdinalLogit, PenalisedLogit):
- exp = True
- else:
- exp = False
-
- df_ref = weakref.ref(df)
-
- if _dmatrices is None:
- # Check for/clean NaNs in input columns
- columns = [dep] + cols_for_formula(formula, df)
- if exposure is not None:
- columns.append(exposure)
- if status is not None:
- columns.append(status)
+ result.dof_model = raw_result.df_model
+ result.dof_resid = raw_result.df_resid
+ result.ll_model = raw_result.llf
+ result.ll_null = raw_result.llnull
- df = df[columns]
- df = check_nan(df, nan_policy)
+ result.terms = raw_terms_from_statsmodels_result(raw_result)
+ result.vcov = raw_result.cov_params()
- # Ensure numeric type for dependent variable
- df[dep], dep_categories = as_numeric(df[dep])
-
- # Convert pandas nullable types for independent variables as this breaks statsmodels
- df = convert_pandas_nullable(df)
-
- # Construct design matrix from formula
- dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
- else:
- dmatrices = _dmatrices
-
- if model_class in (sm.PHReg, OrdinalLogit):
- # Drop explicit intercept term
- # FIXME: Check before dropping
- dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:])
-
- # Add exposure to model
- if exposure is not None:
- if df[exposure].dtype == '{} | {:.2f} | {:.0f} | {} | '.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
-
- out += '
'
- return out
-
- def summary(self):
- """
- Return a stringified summary of the *χ*:sup:`2` test
-
- :rtype: str
- """
-
- table = pd.DataFrame([
- ['{:.2f}'.format(test.statistic), '{:.0f}'.format(test.dof), fmt_p(test.pvalue, PValueStyle.TABULAR)]
- for test in self.tests.values()
- ], index=self.tests.keys(), columns=['χ² ', 'df', 'p '])
-
- return str(table)
-
-# -----------------------------
-# Penalised logistic regression
-
-class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
- """
- statsmodels-compatible model for computing Firth penalised logistic regression
-
- Uses the R *logistf* library.
-
- This class should be used in conjunction with :func:`yli.regress`.
-
- **Example:**
-
- .. code-block::
-
- df = pd.DataFrame({
- 'Pred': [1] * 20 + [0] * 220,
- 'Outcome': [1] * 40 + [0] * 200
- })
- yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
-
- .. code-block:: text
-
- Penalised Logistic Regression Results
- =========================================================
- Dep. Variable: Outcome | No. Observations: 240
- Model: Logit | Df. Model: 1
- Method: Penalised ML | Pseudo R²: 0.37
- Date: 2022-10-19 | LL-Model: -66.43
- Time: 07:50:40 | LL-Null: -105.91
- Std. Errors: Non-Robust | p (LR): <0.001*
- =========================================================
- β (95% CI) p
- ---------------------------------------------
- (Intercept) -2.28 (-2.77 - -1.85) <0.001*
- Pred 5.99 (3.95 - 10.85) <0.001*
- ---------------------------------------------
-
- The output summarises the result of the regression.
- The summary table shows that a penalised method was applied.
- Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
- """
-
- def fit(self, disp=False):
- import rpy2.robjects as ro
- import rpy2.robjects.packages
- import rpy2.robjects.pandas2ri
-
- # Assume data is already cleaned from regress()
- df = self.data.orig_endog.join(self.data.orig_exog)
-
- # Convert bool to int otherwise rpy2 chokes
- df = df.replace({False: 0, True: 1})
-
- # Import logistf
- ro.packages.importr('logistf')
-
- with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
- with ro.local_context() as lc:
- # Convert DataFrame to R
- lc['df'] = df
-
- # Transfer other parameters to R
- lc['formula_'] = self.formula
- lc['alpha'] = config.alpha
-
- # Fit the model
- model = ro.r('logistf(formula_, data=df, alpha=alpha)')
-
- # TODO: Handle categorical terms?
- terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
-
- return RegressionResult(
- None, None, None, None, None, None, None, # Set in regress()
- model,
- 'Penalised Logistic Regression', 'Logit', 'Penalised ML',
- model['n'][0], None, model['df'][0], datetime.now(), 'nonrobust',
- terms,
- model['loglik'][0], model['loglik'][1],
- None, None, None,
- [],
- None # Set exp in regress()
- )
-
-# ------------------------------------------------------
-# Ordinal logistic regression (R/Stata parameterisation)
-
-class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
- """
- statsmodels-compatible model for computing ordinal logistic (or probit) regression
-
- The implementation subclasses statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
+ The implementation calls statsmodels' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
In the native statsmodels implementation, the first cutoff parameter is the true cutoff, but further cutoff parameter are log differences between consecutive cutoffs.
In this parameterisation, cutoff terms are represented directly in the model.
@@ -1196,6 +705,155 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
"""
+ @property
+ def model_long_name(self):
+ return 'Ordinal Logistic Regression'
+
+ @property
+ def model_short_name(self):
+ return 'Ordinal Logit'
+
+ @classmethod
+ def fit(cls, data_dep, data_ind):
+ result = cls()
+ result.cov_type = 'nonrobust'
+
+ # Drop explicit intercept term
+ if 'Intercept' in data_ind:
+ del data_ind['Intercept']
+
+ # Perform regression
+ raw_result = _SMOrdinalLogit(endog=data_dep, exog=data_ind, missing='raise').fit(disp=False)
+
+ result.dof_model = raw_result.df_model
+ result.dof_resid = raw_result.df_resid
+ result.ll_model = raw_result.llf
+ result.ll_null = raw_result.llnull
+
+ raw_terms = raw_terms_from_statsmodels_result(raw_result)
+ result.terms = {}
+
+ # Group ordinal regression cutoffs
+ for raw_name, raw_term in raw_terms.items():
+ if '/' in raw_name:
+ if '(Cutoffs)' not in result.terms:
+ result.terms['(Cutoffs)'] = CategoricalTerm({}, None)
+
+ result.terms['(Cutoffs)'].categories[raw_name] = raw_term
+ else:
+ result.terms[raw_name] = raw_term
+
+ return result
+
+ def brant(self):
+ """
+ Perform the Brant test for the parallel regression assumption in ordinal regression
+
+ :rtype: :class:`BrantResult`
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame(...)
+ model = yli.regress(yli.OrdinalLogit, df, 'apply', 'pared + public + gpa', exp=False)
+ model.brant()
+
+ .. code-block:: text
+
+ χ² df p
+ Omnibus 4.34 3 0.23
+ pared 0.13 1 0.72
+ public 3.44 1 0.06
+ gpa 0.18 1 0.67
+
+ The output shows the result of the Brant test. For example, for the omnibus test of the parallel regression assumption across all independent variables, the *χ*:sup:`2` statistic is 4.34, the *χ*:sup:`2` distribution has 3 degrees of freedom, and the test is not significant, with *p* value 0.23.
+
+ **Reference:** Brant R. Assessing proportionality in the proportional odds model for ordinal logistic regression. *Biometrics*. 1990;46(4):1171–8. `doi:10.2307/2532457 `_
+ """
+
+ df = self.df()
+ if df is None:
+ raise Exception('Referenced DataFrame has been dropped')
+ dep = self.dep
+
+ # Precompute design matrices
+ dmatrices, dep_categories = df_to_dmatrices(df, dep, self.formula, 'omit')
+ s_dep = dmatrices[0][dmatrices[0].columns[0]] # df[dep] as series - must get this from dmatrices to account for as_numeric
+ dmatrix_right = dmatrices[1]
+ dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
+
+ # Fit individual logistic regressions
+ logit_models = []
+ for upper_limit in sorted(s_dep.unique())[:-1]:
+ dep_dichotomous = (s_dep <= upper_limit).astype(int).reset_index(drop=True)
+ logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False)
+
+ if not logit_result.mle_retvals['converged']:
+ raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
+
+ if pd.isna(logit_result.bse).any():
+ raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
+
+ logit_models.append(logit_result)
+
+ logit_betas = np.array([model._results.params for model in logit_models]).T
+ logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
+
+ # vcov is the variance-covariance matrix of all individually fitted betas across all terms
+
+ # | model 1 | model 2 | model 3 | ...
+ # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
+ # model 1 | term 1 |
+ # | term 2 |
+ # model 2 | term 1 |
+ # | term 2 |
+ # ...
+
+ n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
+ n_betas = len(logit_models) * n_terms
+ vcov = np.zeros((n_betas, n_betas))
+
+ # Populate the variance-covariance matrix for comparisons between individually fitted models
+ for j in range(0, len(logit_models) - 1):
+ for l in range(j + 1, len(logit_models)):
+ Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
+ Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
+ Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
+
+ matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
+ j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
+
+ vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
+ vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
+
+ # Populate the variance-covariance matrix within each individually fitted model
+ for i in range(len(logit_models)):
+ vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
+
+ # ------------------
+ # Perform Wald tests
+
+ beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
+ wald_results = {}
+
+ # Omnibus test
+ constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
+ constraint = ', '.join(constraints)
+ dof = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
+ wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof)
+ wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+ # Individual terms
+ for raw_name in dmatrix_right.columns[1:]:
+ constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
+ dof = len(logit_models) - 1 # df = (number of levels minus 2)
+ wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof)
+ wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+
+ return BrantResult(wald_results)
+
+class _SMOrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
def __init__(self, endog, exog, **kwargs):
if 'distr' not in kwargs:
kwargs['distr'] = 'logit'
@@ -1209,3 +867,143 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel):
def transform_reverse_threshold_params(self, params):
return params[:-1]
+
+class PenalisedLogit(RegressionModel):
+ """
+ Firth penalised logistic regression
+
+ Uses the R *logistf* library.
+
+ **Example:**
+
+ .. code-block::
+
+ df = pd.DataFrame({
+ 'Pred': [1] * 20 + [0] * 220,
+ 'Outcome': [1] * 40 + [0] * 200
+ })
+ yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
+
+ .. code-block:: text
+
+ Penalised Logistic Regression Results
+ =========================================================
+ Dep. Variable: Outcome | No. Observations: 240
+ Model: Logit | Df. Model: 1
+ Method: Penalised ML | Pseudo R²: 0.37
+ Date: 2022-10-19 | LL-Model: -66.43
+ Time: 07:50:40 | LL-Null: -105.91
+ Std. Errors: Non-Robust | p (LR): <0.001*
+ =========================================================
+ β (95% CI) p
+ ---------------------------------------------
+ (Intercept) -2.28 (-2.77 - -1.85) <0.001*
+ Pred 5.99 (3.95 - 10.85) <0.001*
+ ---------------------------------------------
+
+ The output summarises the result of the regression.
+ The summary table shows that a penalised method was applied.
+ Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated.
+ """
+
+ @property
+ def model_long_name(self):
+ return 'Penalised Logistic Regression'
+
+ @property
+ def model_short_name(self):
+ return 'Penalised Logit'
+
+ @classmethod
+ def fit(cls, data_dep, data_ind):
+ result = cls()
+ result.cov_type = 'nonrobust'
+
+ import rpy2.robjects as ro
+ import rpy2.robjects.packages
+ import rpy2.robjects.pandas2ri
+
+ df = data_dep.join(data_ind)
+
+ # Drop explicit intercept term
+ if 'Intercept' in df:
+ del df['Intercept']
+
+ # Import logistf
+ ro.packages.importr('logistf')
+
+ with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
+ with ro.local_context() as lc:
+ # Convert DataFrame to R
+ lc['df'] = df
+
+ # Transfer other parameters to R
+ lc['formula_'] = df.columns[0] + ' ~ ' + ' + '.join(df.columns[1:])
+ lc['alpha'] = config.alpha
+
+ # Fit the model
+ model = ro.r('logistf(formula_, data=df, alpha=alpha)')
+
+ result.dof_model = model['df'][0]
+ result.ll_model = model['loglik'][0]
+ result.ll_null = model['loglik'][1]
+
+ result.terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
+
+ return result
+
+# ------------------
+# Brant test helpers
+
+class _Dummy: pass
+
+def _wald_test(param_names, params, formula, vcov, df):
+ # Hack! Piggyback off statsmodels to compute a Wald test
+
+ lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None)
+ lmr.model = _Dummy()
+ lmr.model.data = _Dummy()
+ lmr.model.data.cov_names = param_names
+ lmr.params = params
+ lmr.df_resid = df
+
+ return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True)
+
+class BrantResult:
+ """
+ Result of a Brant test for ordinal regression
+
+ See :meth:`OrdinalLogit.brant`.
+ """
+
+ def __init__(self, tests):
+ #: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
+ self.tests = tests
+
+ def __repr__(self):
+ if config.repr_is_summary:
+ return self.summary()
+ return super().__repr__()
+
+ def _repr_html_(self):
+ out = 'Brant Test Results | χ2 | df | p |
'
+
+ for raw_name, test in self.tests.items():
+ out += '{} | {:.2f} | {:.0f} | {} |
'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
+
+ out += '
'
+ return out
+
+ def summary(self):
+ """
+ Return a stringified summary of the *χ*:sup:`2` test
+
+ :rtype: str
+ """
+
+ table = pd.DataFrame([
+ ['{:.2f}'.format(test.statistic), '{:.0f}'.format(test.dof), fmt_p(test.pvalue, PValueStyle.TABULAR)]
+ for test in self.tests.values()
+ ], index=self.tests.keys(), columns=['χ² ', 'df', 'p '])
+
+ return str(table)
diff --git a/yli/survival.py b/yli/survival.py
index 539f741..ed27489 100644
--- a/yli/survival.py
+++ b/yli/survival.py
@@ -20,11 +20,7 @@ import statsmodels.api as sm
from .config import config
from .sig_tests import ChiSquaredResult
-from .regress import RegressionResult, SingleTerm
-from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
-
-from datetime import datetime
-import weakref
+from .utils import Estimate, check_nan
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
"""
@@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
-
-# --------------------------------
-# Interval-censored Cox regression
-
-def cox_interval_censored(
- df, time_left, time_right, formula, *,
- bootstrap_samples=100,
- nan_policy='warn',
- bool_baselevels=False, exp=True,
-):
- # TODO: Documentation
-
- df_ref = weakref.ref(df)
-
- # Check for/clean NaNs in input columns
- columns = [time_left, time_right] + cols_for_formula(formula, df)
-
- df = df[columns]
- df = check_nan(df, nan_policy)
-
- # FIXME: Ensure numeric type for dependent variable
- #df[dep], dep_categories = as_numeric(df[dep])
- if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64':
- raise NotImplementedError('Time dtypes must be float64')
-
- # Convert pandas nullable types for independent variables
- df = convert_pandas_nullable(df)
-
- # ---------
- # Fit model
-
- # lifelines.CoxPHFitter doesn't do confidence intervals so we use R
-
- import rpy2.robjects as ro
- import rpy2.robjects.packages
- import rpy2.robjects.pandas2ri
-
- # Convert bool to int otherwise rpy2 chokes
- df = df.replace({False: 0, True: 1})
-
- # Import icenReg
- ro.packages.importr('icenReg')
-
- with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
- with ro.local_context() as lc:
- # Convert DataFrame to R
- lc['df'] = df
-
- # Transfer other parameters to R
- lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
- lc['bootstrap_samples'] = bootstrap_samples
-
- # FIXME: Seed bootstrap RNG?
-
- # Fit the model
- ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
-
- model = ro.r('model')
- # Hard to access attributes through rpy2
- term_parameters = ro.r('model$coef')
- term_names = ro.r('names(model$coef)')
- term_cis = ro.r('confint(model)')
- cov_matrix = ro.r('model$var')
- llf = ro.r('model$llk')[0]
-
- # TODO: Handle categorical terms?
- terms = {}
- for i in range(len(term_parameters)):
- # These values not directly exposed so we must calculate them
- se = np.sqrt(cov_matrix[i, i])
- pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(term_parameters[i]))
-
- term = SingleTerm(term_names[i], Estimate(term_parameters[i], term_cis[i][0], term_cis[i][1]), pvalue)
- terms[term_names[i]] = term
-
- result = RegressionResult(
- None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
- model,
- 'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
- len(df), None, None, datetime.now(), 'Bootstrap',
- terms,
- llf, None,
- None, None, None,
- [],
- exp
- )
-
- return result