diff --git a/tests/test_anova.py b/tests/test_anova.py index 441c59e..4d188c4 100644 --- a/tests/test_anova.py +++ b/tests/test_anova.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -17,7 +17,6 @@ from pytest import approx import pandas as pd -import statsmodels.api as sm import yli @@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2(): 'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74] }) - result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest() + result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest() assert result.statistic == approx(545.316/18.4366, rel=0.001) assert result.dof1 == 2 diff --git a/tests/test_bayes_factors.py b/tests/test_bayes_factors.py index 26582d0..f7f5758 100644 --- a/tests/test_bayes_factors.py +++ b/tests/test_bayes_factors.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -17,7 +17,6 @@ from pytest import approx import pandas as pd -import statsmodels.api as sm import yli @@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero(): 'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30] }) - result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') + result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') # model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial()) # bf_fit <- BF(model, hypothesis="Fibrinogen = 0") diff --git a/tests/test_chi2.py b/tests/test_chi2.py index 0685fc7..579e537 100644 --- a/tests/test_chi2.py +++ b/tests/test_chi2.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -68,10 +68,10 @@ def test_chi2_ol10_18(): assert result.oddsratio.ci_lower == approx(1.113, abs=0.001) assert result.oddsratio.ci_upper == approx(1.596, abs=0.001) - expected_summary = '''Stress False True -Response -False 250 400 -True 750 1600 + expected_summary = '''Stress False True +Response +False 250 400 +True 750 1600 χ²(1) = 9.82; p = 0.002* OR (95% CI) = 1.33 (1.11–1.60) diff --git a/tests/test_ordinallogit.py b/tests/test_ordinallogit.py index 67f0831..b2ab173 100644 --- a/tests/test_ordinallogit.py +++ b/tests/test_ordinallogit.py @@ -41,16 +41,16 @@ def test_ordinallogit_ucla(): assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_lower == approx(2.72234, abs=0.001) assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001) - expected_summary = ''' Ordinal Logistic Regression Results -=============================================================== -Dep. Variable: apply | No. Observations: 400 - Model: OrdinalLogit | Df. 
Model: 5 - Method: Maximum Likelihood | Df. Residuals: 395 - Date: {0:%Y-%m-%d} | Pseudo R²: 0.03 - Time: {0:%H:%M:%S} | LL-Model: -358.51 - Std. Errors: Non-Robust | LL-Null: -370.60 - | p (LR): <0.001* -=============================================================== + expected_summary = ''' Ordinal Logistic Regression Results +========================================================== +Dep. Variable: apply | No. Observations: 400 + Model: Ordinal Logit | Df. Model: 5 + Date: {0:%Y-%m-%d} | Df. Residuals: 395 + Time: {0:%H:%M:%S} | Pseudo R²: 0.03 + Std. Errors: Non-Robust | LL-Model: -358.51 + | LL-Null: -370.60 + | p (LR): <0.001* +============================================================ β (95% CI) p ------------------------------------------------------------ pared 1.05 (0.53 - 1.57) <0.001* diff --git a/tests/test_regress.py b/tests/test_regress.py index eccd8c5..a9777bf 100644 --- a/tests/test_regress.py +++ b/tests/test_regress.py @@ -1,5 +1,5 @@ # scipy-yli: Helpful SciPy utilities and recipes -# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -14,11 +14,11 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import pytest from pytest import approx import numpy as np import pandas as pd -import statsmodels.api as sm import yli from yli.regress import CategoricalTerm @@ -31,7 +31,7 @@ def test_regress_ols_ol11_4(): 'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34] }) - result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh') + result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh') assert result.dof_model == 1 assert result.dof_resid == 18 @@ -46,9 +46,27 @@ def test_regress_ols_ol11_4(): assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01) assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01) + + expected_summary = ''' Ordinary Least Squares Regression Results +======================================================= +Dep. Variable: GrowthRet | No. Observations: 20 + Model: OLS | Df. Model: 1 + Date: {0:%Y-%m-%d} | Df. Residuals: 18 + Time: {0:%H:%M:%S} | R²: 0.74 + Std. 
Errors: Non-Robust | F: 52.01 + | p (F): <0.001* +======================================================= + β (95% CI) p +---------------------------------------------- +(Intercept) 47.48 (38.17 - 56.78) <0.001* + SoilPh -7.86 (-10.15 - -5.57) <0.001* +----------------------------------------------'''.format(result.fitted_dt) + + assert result.summary() == expected_summary +@pytest.mark.skip('Not implemented in refactored regression implementation') def test_regress_bootstrap_ols_ol11_4(): - """Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7""" + """Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7""" df = pd.DataFrame({ 'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2], @@ -86,7 +104,7 @@ def test_regress_ols_ol13_5(): }) df['LNC'] = np.log(df['C']) - result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT') + result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT') assert result.dof_model == 10 assert result.dof_resid == 21 @@ -126,7 +144,7 @@ def test_regress_logit_ol12_23(): 'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30] }) - result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') + result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') # Some numerical differences as intercept term is very negative lrtest_result = result.lrtest_null() @@ -152,10 +170,10 @@ def test_regress_logit_ol12_23(): ====================================================== Dep. Variable: Unhealthy | No. Observations: 32 Model: Logit | Df. Model: 2 - Method: MLE | Df. Residuals: 29 - Date: {0:%Y-%m-%d} | Pseudo R²: 0.26 - Time: {0:%H:%M:%S} | LL-Model: -11.47 - Std. Errors: Non-Robust | LL-Null: -15.44 + Date: {0:%Y-%m-%d} | Df. Residuals: 29 + Time: {0:%H:%M:%S} | Pseudo R²: 0.26 + Std. Errors: Non-Robust | LL-Model: -11.47 + | LL-Null: -15.44 | p (LR): 0.02* ====================================================== exp(β) (95% CI) p @@ -182,7 +200,7 @@ def test_regress_logit_ol10_18(): 'Stress': np.repeat([d[1] for d in data], [d[2] for d in data]) }) - result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True) + result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True) assert isinstance(result.terms['Response'], CategoricalTerm) assert result.terms['Response'].ref_category == False @@ -217,15 +235,15 @@ def test_regress_penalisedlogit_kleinman(): assert lrtest_result.dof == 1 assert lrtest_result.pvalue < 0.0001 - expected_summary = ''' Penalised Logistic Regression Results -========================================================= -Dep. Variable: Outcome | No. Observations: 240 - Model: Logit | Df. Model: 1 - Method: Penalised ML | Pseudo R²: 0.37 - Date: {0:%Y-%m-%d} | LL-Model: -66.43 - Time: {0:%H:%M:%S} | LL-Null: -105.91 - Std. Errors: Non-Robust | p (LR): <0.001* -========================================================= + expected_summary = ''' Penalised Logistic Regression Results +============================================================ +Dep. Variable: Outcome | No. Observations: 240 + Model: Penalised Logit | Df. Model: 1 + Date: {0:%Y-%m-%d} | Pseudo R²: 0.37 + Time: {0:%H:%M:%S} | LL-Model: -66.43 + Std. 
Errors: Non-Robust | LL-Null: -105.91 + | p (LR): <0.001* +============================================================ β (95% CI) p --------------------------------------------- (Intercept) -2.28 (-2.77 - -1.85) <0.001* diff --git a/yli/__init__.py b/yli/__init__.py index c93f752..eedef22 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .graphs import init_fonts from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif +from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind -from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull +from .survival import kaplanmeier, logrank, turnbull from .utils import as_ordinal def reload_me(): diff --git a/yli/regress.py b/yli/regress.py index 4631ea5..303babd 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -23,11 +23,12 @@ import statsmodels, statsmodels.base.model, statsmodels.miscmodels.ordinal_model import statsmodels.api as sm from statsmodels.iolib.table import SimpleTable from statsmodels.stats.outliers_influence import variance_inflation_factor -from tqdm import tqdm from datetime import datetime +import io import itertools -import warnings +import json +import subprocess import weakref from .bayes_factors import BayesFactor, bayesfactor_afbf @@ -36,6 +37,9 @@ from .shap import ShapResult from .sig_tests import ChiSquaredResult, FTestResult from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term +# TODO: Documentation +# TODO: Bootstrap + def vif(df, formula=None, *, nan_policy='warn'): """ Calculate the variance inflation factor (VIF) for each variable in *df* @@ -93,405 +97,176 @@ def vif(df, formula=None, *, nan_policy='warn'): return pd.Series(vifs) -# ---------- -# Regression +def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels=False, exp=None): + """ + Fit a statsmodels regression model + + :param model_class: Type of regression model to fit + :type model_class: :class:`RegressionModel` subclass + :param df: Data to perform regression on + :type df: DataFrame + :param dep: Column in *df* for the dependent variable (numeric) + :type dep: str + :param formula: Patsy formula for the regression model + :type formula: str + :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) + :type nan_policy: str + :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False* + :type bool_baselevels: bool + :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class* + :type exp: bool + + :rtype: :class:`RegressionModel` + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Unhealthy': [False, False, False, ...], + 'Fibrinogen': [2.52, 2.46, 2.29, ...], + 'GammaGlobulin': [38, 36, 36, ...] + }) + yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') + + .. code-block:: text + + Logistic Regression Results + ====================================================== + Dep. Variable: Unhealthy | No. 
Observations: 32 + Model: Logit | Df. Model: 2 + Date: 2022-10-18 | Df. Residuals: 29 + Time: 19:00:34 | Pseudo R²: 0.26 + Std. Errors: Non-Robust | LL-Model: -11.47 + | LL-Null: -15.44 + | p (LR): 0.02* + ====================================================== + exp(β) (95% CI) p + ----------------------------------------------- + (Intercept) 0.00 (0.00 - 0.24) 0.03* + Fibrinogen 6.80 (1.01 - 45.79) 0.049* + GammaGlobulin 1.17 (0.92 - 1.48) 0.19 + ----------------------------------------------- + + The output summarises the results of the regression. + Note that the parameter estimates are automatically exponentiated. + For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049. + """ + + if not any(x.__name__ == 'RegressionModel' for x in model_class.__bases__): + raise ValueError('model_class must be a RegressionModel') + + df_ref = weakref.ref(df) + dmatrices, dep_categories = df_to_dmatrices(df, dep, formula, nan_policy) + + # Fit model + result = model_class.fit(dmatrices[0], dmatrices[1]) + result.df = df_ref + result.dep = dep + result.formula = formula + result.nan_policy = nan_policy + if exp is not None: + result.exp = exp + result.fitted_dt = datetime.now() + result.nobs = len(dmatrices[0]) + + # Process categorical terms, etc. + raw_terms = result.terms + result.terms = {} + + for raw_name, raw_term in raw_terms.items(): + # Rename terms + if raw_name == '(Intercept)': + # Already processed + result.terms[raw_name] = raw_term + elif raw_name == '(Cutoffs)': + result.terms[raw_name] = raw_term + + if dep_categories is not None: + # Need to convert factorised names back into original names + old_cutoffs = raw_term.categories + raw_term.categories = {} + + for cutoff_raw_name, cutoff_term in old_cutoffs.items(): + bits = cutoff_raw_name.split('/') + term = dep_categories[round(float(bits[0]))] + '/' + dep_categories[round(float(bits[1]))] + result.terms[raw_name].categories[term] = cutoff_term + else: + # Parse if required + factor, column, contrast = parse_patsy_term(formula, df, raw_name) + + if contrast is not None: + # Categorical term + + if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]): + # Treat as single term + result.terms[column] = raw_term + else: + # Add a new categorical term if not exists + if column not in result.terms: + ref_category = formula_factor_ref_category(formula, df, factor) + result.terms[column] = CategoricalTerm({}, ref_category) + + result.terms[column].categories[contrast] = raw_term + else: + # Single term + result.terms[column] = raw_term + + return result -class LikelihoodRatioTestResult(ChiSquaredResult): - """ - Result of a likelihood ratio test for regression +def df_to_dmatrices(df, dep, formula, nan_policy): + # Check for/clean NaNs in input columns + columns = [dep] + cols_for_formula(formula, df) - See :meth:`RegressionResult.lrtest_null`. 
- """ + df = df[columns] + df = check_nan(df, nan_policy) - def __init__(self, statistic, dof, pvalue): - super().__init__(statistic, dof, pvalue) + # Ensure numeric type for dependent variable + df[dep], dep_categories = as_numeric(df[dep]) - def _repr_html_(self): - return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) + # Convert pandas nullable types for independent variables as this breaks statsmodels + df = convert_pandas_nullable(df) - def summary(self): - """ - Return a stringified summary of the likelihood ratio test - - :rtype: str - """ - - return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) + # Construct design matrix from formula + dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') + + return dmatrices, dep_categories -class RegressionResult: - """ - Result of a regression - - See :func:`yli.regress`. - """ - - def __init__(self, - model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs, - raw_result, - full_name, model_name, fit_method, - nobs, nevents, dof_model, fitted_dt, cov_type, - terms, - ll_model, ll_null, - dof_resid, rsquared, f_statistic, - comments, - exp - ): - # Data about how model fitted - #: See :func:`yli.regress` - self.model_class = model_class - #: Data fitted (*weakref* to *DataFrame*) - self.df = df - #: See :func:`yli.regress` - self.dep = dep - #: See :func:`yli.regress` - self.formula = formula - #: See :func:`yli.regress` - self.nan_policy = nan_policy - #: See :func:`yli.regress` - self.model_kwargs = model_kwargs - #: See :func:`yli.regress` - self.fit_kwargs = fit_kwargs +class RegressionModel: + def __init__(self): + # Model configuration (all set in yli.regress) + self.df = None + self.dep = None + self.formula = None + self.nan_policy = None + self.exp = False + self.cov_type = None - #: Raw result from statsmodels *model.fit* - self.raw_result = raw_result + # Summary information + self.fitted_dt = None # Set in yli.regress + self.nobs = None # Set in yli.regress + self.nevents = None + self.dof_model = None + self.dof_resid = None + self.rsquared = None + self.ll_model = None + self.ll_null = None + self.f_statistic = None - # Information for display - #: Full name of the regression model type (*str*) - self.full_name = full_name - #: Short name of the regression model type (*str*) - self.model_name = model_name - #: Method for fitting the regression model (*str*) - self.fit_method = fit_method + # Parameters + self.terms = None + self.vcov = None - # Basic fitted model information - #: Number of observations (*int*) - self.nobs = nobs - #: Number of events (*int*, time-to-event models only) - self.nevents = nevents - #: Degrees of freedom for the model (*int*) - self.dof_model = dof_model - #: Date and time of fitting the model (Python *datetime*) - self.fitted_dt = fitted_dt - #: Method for computing the covariance matrix (*str*) - self.cov_type = cov_type - - # Regression coefficients/p values - #: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`) - self.terms = terms - - # Model log-likelihood - #: Log-likelihood of fitted model (*float*) - self.ll_model = ll_model - #: Log-likelihood of null model (*float*) - self.ll_null = ll_null - - # Extra statistics (not all regression models have these) - #: Degrees of freedom for the residuals (*int*; *None* if N/A) - self.dof_resid = dof_resid - #: *R*:sup:`2` 
statistic (*float*; *None* if N/A) - self.rsquared = rsquared - #: *F* statistic (*float*; *None* if N/A) - self.f_statistic = f_statistic - - #: Comments for the model (*List[str]*) - self.comments = comments or [] - - # Config for display style - #: See :func:`yli.regress` - self.exp = exp + # Comments + self.comments = [] @property - def pseudo_rsquared(self): - """McFadden's pseudo *R*:sup:`2` statistic""" - - return 1 - self.ll_model/self.ll_null + def model_long_name(self): + return None - def lrtest_null(self): - """ - Compute the likelihood ratio test comparing the model to the null model - - :rtype: :class:`LikelihoodRatioTestResult` - """ - - statistic = -2 * (self.ll_null - self.ll_model) - pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) - - return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) - - def ftest(self): - """ - Perform the *F* test that all slopes are 0 - - :rtype: :class:`yli.sig_tests.FTestResult` - """ - - pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic) - return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue) - - def bayesfactor_beta_zero(self, term): - """ - Compute a Bayes factor testing the hypothesis that the given beta is 0 - - Uses the R *BFpack* library. - - Requires the regression to be from statsmodels. - The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`. - - :param term: Raw name of the term to be tested - :type term: str - - :rtype: :class:`yli.bayes_factors.BayesFactor` - """ - - # FIXME: Allow specifying our renamed terms - - # Get parameters required for AFBF - params = pd.Series({raw_name.replace('[', '_').replace(']', '_'): beta for raw_name, beta in self.raw_result.params.items()}) - cov = self.raw_result.cov_params() - - # Compute BF matrix - bf01 = bayesfactor_afbf(params, cov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_'))) - bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term)) - - if bf01.factor >= 1: - return bf01 - else: - return bf01.invert() - - def brant(self): - """ - Perform the Brant test for the parallel regression assumption in ordinal regression - - Applicable when the model was fitted using :class:`OrdinalLogit`. - - :rtype: :class:`BrantResult` - - **Example:** - - .. code-block:: - - df = pd.DataFrame(...) - model = yli.regress(yli.OrdinalLogit, df, 'apply', 'pared + public + gpa', exp=False) - model.brant() - - .. code-block:: text - - χ² df p - Omnibus 4.34 3 0.23 - pared 0.13 1 0.72 - public 3.44 1 0.06 - gpa 0.18 1 0.67 - - The output shows the result of the Brant test. For example, for the omnibus test of the parallel regression assumption across all independent variables, the *χ*:sup:`2` statistic is 4.34, the *χ*:sup:`2` distribution has 3 degrees of freedom, and the test is not significant, with *p* value 0.23. - - **Reference:** Brant R. Assessing proportionality in the proportional odds model for ordinal logistic regression. *Biometrics*. 1990;46(4):1171–8. 
`doi:10.2307/2532457 `_ - """ - - df = self.df() - if df is None: - raise Exception('Referenced DataFrame has been dropped') - dep = self.dep - - # Check for/clean NaNs - # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' - # Following this, we pass nan_policy='raise' to assert no NaNs remaining - df = df[[dep] + cols_for_formula(self.formula, df)] - df = check_nan(df, 'omit') - - # Ensure numeric type for dependent variable - df[dep], dep_categories = as_numeric(df[dep]) - - # Convert pandas nullable types for independent variables as this breaks statsmodels - df = convert_pandas_nullable(df) - - # Precompute design matrix for RHS - # This is also X+ in Brant paper - dmatrix_right = patsy.dmatrix(self.formula, df, return_type='dataframe') - dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication - - # Fit individual logistic regressions - logit_models = [] - for upper_limit in sorted(df[dep].unique())[:-1]: - dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) - logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) - - if not logit_result.mle_retvals['converged']: - raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) - - if pd.isna(logit_result.bse).any(): - raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit)) - - logit_models.append(logit_result) - - logit_betas = np.array([model._results.params for model in logit_models]).T - logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities - - # vcov is the variance-covariance matrix of all individually fitted betas across all terms - - # | model 1 | model 2 | model 3 | ... - # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ... - # model 1 | term 1 | - # | term 2 | - # model 2 | term 1 | - # | term 2 | - # ... 
- - n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept) - n_betas = len(logit_models) * n_terms - vcov = np.zeros((n_betas, n_betas)) - - # Populate the variance-covariance matrix for comparisons between individually fitted models - for j in range(0, len(logit_models) - 1): - for l in range(j + 1, len(logit_models)): - Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j]) - Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l]) - Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l]) - - matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right) - j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l - - vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov - vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov - - # Populate the variance-covariance matrix within each individually fitted model - for i in range(len(logit_models)): - vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:] - - # ------------------ - # Perform Wald tests - - beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]] - wald_results = {} - - # Omnibus test - constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]] - constraint = ', '.join(constraints) - df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept) - wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) - wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) - - # Individual terms - for raw_name in dmatrix_right.columns[1:]: - constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) - df = len(logit_models) - 1 # df = (number of levels minus 2) - wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) - wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) - - return BrantResult(wald_results) - - def bootstrap(self, samples=1000): - """ - Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model - - :param samples: Number of bootstrap samples to draw - :type samples: int - - :rtype: :class:`yli.regress.RegressionResult` - """ - - df = self.df() - if df is None: - raise Exception('Referenced DataFrame has been dropped') - dep = self.dep - - # Check for/clean NaNs - # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' - # Following this, we pass nan_policy='raise' to assert no NaNs remaining - df = df[[dep] + cols_for_formula(self.formula, df)] - df = check_nan(df, 'omit') - - # Ensure numeric type for dependent variable - df[dep], dep_categories = as_numeric(df[dep]) - - # Convert pandas nullable types for independent variables as this breaks statsmodels - df = convert_pandas_nullable(df) - - # Precompute design matrices - dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe') - - # Fit full model - #full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, 
fit_kwargs=self.fit_kwargs) - - # Initialise bootstrap_results - bootstrap_results = {} # Dict mapping term raw names to bootstrap betas - for term in self.terms.values(): - if isinstance(term, SingleTerm): - bootstrap_results[term.raw_name] = [] - else: - for sub_term in term.categories.values(): - bootstrap_results[sub_term.raw_name] = [] - - # Draw bootstrap samples and regress - dmatrices = dmatrices[0].join(dmatrices[1]) - - for i in tqdm(range(samples)): - bootstrap_rows = dmatrices.sample(len(df), replace=True) - model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs) - model.formula = dep + ' ~ ' + self.formula - - result = model.fit(disp=False, **self.fit_kwargs) - - for raw_name, raw_beta in zip(model.exog_names, result._results.params): - bootstrap_results[raw_name].append(raw_beta) - - # Combine bootstrap results - terms = {} - for term_name, term in self.terms.items(): - if isinstance(term, SingleTerm): - bootstrap_betas = bootstrap_results[term.raw_name] - bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) - bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) - terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) - else: - categories = {} - for sub_term_name, sub_term in term.categories.items(): - bootstrap_betas = bootstrap_results[sub_term.raw_name] - bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) - bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) - categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) - terms[term_name] = CategoricalTerm(categories, term.ref_category) - - return RegressionResult( - self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs, - None, - self.full_name, self.model_name, self.fit_method, - self.nobs, None, self.dof_model, datetime.now(), 'Bootstrap', - terms, - self.ll_model, self.ll_null, - self.dof_resid, self.rsquared, self.f_statistic, - self.comments, - self.exp - ) - - def shap(self, **kwargs): - """ - Compute SHAP values for the model - - Uses the Python *shap* library. - - :param kwargs: Keyword arguments to pass to *shap.LinearExplainer* - - :rtype: :class:`yli.shap.ShapResult` - - **Reference:** Lundberg SM, Lee SI. A unified approach to interpreting model predictions. In: Guyon I, Von Luxburg U, Bengio S, et al., editors. *Advances in Neural Information Processing Systems*; 2017 Dec 4–9; Long Beach, CA. 
https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html - """ - - import shap - - xdata = ShapResult._get_xdata(self) - - # Combine terms into single list - params = [] - for term in self.terms.values(): - if isinstance(term, SingleTerm): - params.append(term.beta.point) - else: - params.extend(s.beta.point for s in term.categories.values()) - - explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept - shap_values = explainer.shap_values(xdata).astype('float') - - return ShapResult(weakref.ref(self), shap_values, list(xdata.columns)) + @property + def model_short_name(self): + return None def _header_table(self, html): """Return the entries for the header table""" @@ -500,8 +275,7 @@ class RegressionResult: left_col = [] left_col.append(('Dep. Variable:', self.dep)) - left_col.append(('Model:', self.model_name)) - left_col.append(('Method:', self.fit_method)) + left_col.append(('Model:', self.model_short_name)) left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d'))) left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S'))) if self.cov_type: @@ -555,7 +329,7 @@ class RegressionResult: # Render header table left_col, right_col = self._header_table(html=True) - out = ''.format(self.full_name) + out = '
<table><caption>{} Results</caption>
'.format(self.model_long_name) for left_cell, right_cell in itertools.zip_longest(left_col, right_col): out += ''.format( left_cell[0] if left_cell else '', @@ -625,7 +399,7 @@ class RegressionResult: elif len(left_col) > len(right_col): right_col.extend([('', '')] * (len(left_col) - len(right_col))) - table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.full_name)) + table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.model_long_name)) table1.insert_stubs(2, [' | '] * len(left_col)) # Get rid of last line (merge with next table) @@ -681,12 +455,130 @@ class RegressionResult: out += '\n{}. {}'.format(i + 1, comment) return out + + # -------------------- + # Post hoc tests, etc. + + def bayesfactor_beta_zero(self, term): + """ + Compute a Bayes factor testing the hypothesis that the given beta is 0 + + Uses the R *BFpack* library. + + Requires the regression to be from statsmodels. + The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`. + + :param term: Raw name of the term to be tested + :type term: str + + :rtype: :class:`yli.bayes_factors.BayesFactor` + """ + + # FIXME: Allow specifying our renamed terms + + # Get parameters required for AFBF + raw_params = {} + for t in self.terms.values(): + if isinstance(t, CategoricalTerm): + for t in t.categories.values(): + raw_params[t.raw_name.replace('[', '_').replace(']', '_')] = t.beta.point + else: + raw_params[t.raw_name.replace('[', '_').replace(']', '_')] = t.beta.point + + # Compute BF matrix + bf01 = bayesfactor_afbf(pd.Series(raw_params), self.vcov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_'))) + bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term)) + + if bf01.factor >= 1: + return bf01 + else: + return bf01.invert() + + def ftest(self): + """ + Perform the *F* test that all slopes are 0 + + :rtype: :class:`yli.sig_tests.FTestResult` + """ + + pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic) + return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue) + + def lrtest_null(self): + """ + Compute the likelihood ratio test comparing the model to the null model + + :rtype: :class:`LikelihoodRatioTestResult` + """ + + statistic = -2 * (self.ll_null - self.ll_model) + pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) + LikelihoodRatioTestResult + return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) + + @property + def pseudo_rsquared(self): + """McFadden's pseudo *R*:sup:`2` statistic""" + + return 1 - self.ll_model/self.ll_null + + def shap(self, **kwargs): + """ + Compute SHAP values for the model + + Uses the Python *shap* library. + + :param kwargs: Keyword arguments to pass to *shap.LinearExplainer* + + :rtype: :class:`yli.shap.ShapResult` + + **Reference:** Lundberg SM, Lee SI. A unified approach to interpreting model predictions. In: Guyon I, Von Luxburg U, Bengio S, et al., editors. *Advances in Neural Information Processing Systems*; 2017 Dec 4–9; Long Beach, CA. 
https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html + """ + + import shap + + xdata = ShapResult._get_xdata(self) + + # Combine terms into single list + params = [] + for term in self.terms.values(): + if isinstance(term, SingleTerm): + params.append(term.beta.point) + else: + params.extend(s.beta.point for s in term.categories.values()) + + explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept + shap_values = explainer.shap_values(xdata).astype('float') + + return ShapResult(weakref.ref(self), shap_values, list(xdata.columns)) + +class LikelihoodRatioTestResult(ChiSquaredResult): + """ + Result of a likelihood ratio test for regression + + See :meth:`RegressionResult.lrtest_null`. + """ + + def __init__(self, statistic, dof, pvalue): + super().__init__(statistic, dof, pvalue) + + def _repr_html_(self): + return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) + + def summary(self): + """ + Return a stringified summary of the likelihood ratio test + + :rtype: str + """ + + return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) class SingleTerm: - """A term in a :class:`RegressionResult` which is a single term""" + """A term in a :class:`RegressionModel` which is a single term""" def __init__(self, raw_name, beta, pvalue): - #: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`) + #: Raw name of the term (*str*; e.g. in :attr:`RegressionModel.raw_result`) self.raw_name = raw_name #: :class:`yli.utils.Estimate` of the coefficient self.beta = beta @@ -694,7 +586,7 @@ class SingleTerm: self.pvalue = pvalue class CategoricalTerm: - """A group of terms in a :class:`RegressionResult` corresponding to a categorical variable""" + """A group of terms in a :class:`RegressionModel` corresponding to a categorical variable""" def __init__(self, categories, ref_category): #: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`) @@ -702,463 +594,80 @@ class CategoricalTerm: #: Name of the reference category (*str*) self.ref_category = ref_category -def regress( - model_class, df, dep, formula, *, - nan_policy='warn', - model_kwargs=None, fit_kwargs=None, - family=None, exposure=None, status=None, # common model_kwargs - cov_type=None, method=None, maxiter=None, start_params=None, # common fit_kwargs - bool_baselevels=False, exp=None, - _dmatrices=None, -): - """ - Fit a statsmodels regression model +def raw_terms_from_statsmodels_result(raw_result): + return { + ('(Intercept)' if raw_name == 'Intercept' else raw_name): SingleTerm( + raw_name=raw_name, + beta=Estimate(raw_param, raw_ci[0], raw_ci[1]), + pvalue=raw_p + ) + for raw_name, raw_param, raw_ci, raw_p in zip(raw_result.model.exog_names, raw_result.params.values, raw_result.conf_int(config.alpha).values, raw_result.pvalues.values) + } + +# ------------------------ +# Concrete implementations + +class Logit(RegressionModel): + @property + def model_long_name(self): + return 'Logistic Regression' - :param model_class: Type of regression model to fit - :type model_class: statsmodels model class - :param df: Data to perform regression on - :type df: DataFrame - :param dep: Column in *df* for the dependent variable (numeric) - :type dep: str - :param formula: Patsy formula for the regression model - :type formula: str - :param exposure: Column in *df* for 
the exposure variable (numeric, some models only) - :type exposure: str - :param status: Column in *df* for the status variable (True/False or 1/0, time-to-event models only) - :type status: str - :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) - :type nan_policy: str - :param model_kwargs: Keyword arguments to pass to *model_class* constructor - :type model_kwargs: dict - :param fit_kwargs: Keyword arguments to pass to *model.fit* - :type fit_kwargs: dict - :param family: See statsmodels *GLM* constructor - :param cov_type: See statsmodels *model.fit* - :param method: See statsmodels *model.fit* - :param maxiter: See statsmodels *model.fit* - :param start_params: See statsmodels *model.fit* - :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False* - :type bool_baselevels: bool - :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class* - :type exp: bool + @property + def model_short_name(self): + return 'Logit' - :rtype: :class:`yli.regress.RegressionResult` - - **Example:** - - .. code-block:: + @classmethod + def fit(cls, data_dep, data_ind): + result = cls() + result.exp = True + result.cov_type = 'nonrobust' - df = pd.DataFrame({ - 'Unhealthy': [False, False, False, ...], - 'Fibrinogen': [2.52, 2.46, 2.29, ...], - 'GammaGlobulin': [38, 36, 36, ...] - }) - yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') - - .. code-block:: text + # Perform regression + raw_result = sm.Logit(endog=data_dep, exog=data_ind, missing='raise').fit(disp=False) - Logistic Regression Results - ====================================================== - Dep. Variable: Unhealthy | No. Observations: 32 - Model: Logit | Df. Model: 2 - Method: MLE | Df. Residuals: 29 - Date: 2022-10-18 | Pseudo R²: 0.26 - Time: 19:00:34 | LL-Model: -11.47 - Std. Errors: Non-Robust | LL-Null: -15.44 - | p (LR): 0.02* - ====================================================== - exp(β) (95% CI) p - ----------------------------------------------- - (Intercept) 0.00 (0.00 - 0.24) 0.03* - Fibrinogen 6.80 (1.01 - 45.79) 0.049* - GammaGlobulin 1.17 (0.92 - 1.48) 0.19 - ----------------------------------------------- - - The output summarises the results of the regression. - Note that the parameter estimates are automatically exponentiated. - For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049. 
- """ - - # Populate model_kwargs - if model_kwargs is None: - model_kwargs = {} - if family is not None: - model_kwargs['family'] = family - - # Populate fit_kwargs - if fit_kwargs is None: - fit_kwargs = {} - if cov_type is not None: - fit_kwargs['cov_type'] = cov_type - if method is not None: - fit_kwargs['method'] = method - if maxiter is not None: - fit_kwargs['maxiter'] = maxiter - if start_params is not None: - fit_kwargs['start_params'] = start_params - - # Autodetect whether to exponentiate - if exp is None: - if model_class in (sm.Logit, sm.PHReg, sm.Poisson, OrdinalLogit, PenalisedLogit): - exp = True - else: - exp = False - - df_ref = weakref.ref(df) - - if _dmatrices is None: - # Check for/clean NaNs in input columns - columns = [dep] + cols_for_formula(formula, df) - if exposure is not None: - columns.append(exposure) - if status is not None: - columns.append(status) + result.dof_model = raw_result.df_model + result.dof_resid = raw_result.df_resid + result.ll_model = raw_result.llf + result.ll_null = raw_result.llnull - df = df[columns] - df = check_nan(df, nan_policy) + result.terms = raw_terms_from_statsmodels_result(raw_result) + result.vcov = raw_result.cov_params() - # Ensure numeric type for dependent variable - df[dep], dep_categories = as_numeric(df[dep]) - - # Convert pandas nullable types for independent variables as this breaks statsmodels - df = convert_pandas_nullable(df) - - # Construct design matrix from formula - dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') - else: - dmatrices = _dmatrices - - if model_class in (sm.PHReg, OrdinalLogit): - # Drop explicit intercept term - # FIXME: Check before dropping - dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:]) - - # Add exposure to model - if exposure is not None: - if df[exposure].dtype == '`_ + """ + + df = self.df() + if df is None: + raise Exception('Referenced DataFrame has been dropped') + dep = self.dep + + # Precompute design matrices + dmatrices, dep_categories = df_to_dmatrices(df, dep, self.formula, 'omit') + s_dep = dmatrices[0][dmatrices[0].columns[0]] # df[dep] as series - must get this from dmatrices to account for as_numeric + dmatrix_right = dmatrices[1] + dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication + + # Fit individual logistic regressions + logit_models = [] + for upper_limit in sorted(s_dep.unique())[:-1]: + dep_dichotomous = (s_dep <= upper_limit).astype(int).reset_index(drop=True) + logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False) + + if not logit_result.mle_retvals['converged']: + raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) + + if pd.isna(logit_result.bse).any(): + raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit)) + + logit_models.append(logit_result) + + logit_betas = np.array([model._results.params for model in logit_models]).T + logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities + + # vcov is the variance-covariance matrix of all individually fitted betas across all terms + + # | model 1 | model 2 | model 3 | ... + # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ... + # model 1 | term 1 | + # | term 2 | + # model 2 | term 1 | + # | term 2 | + # ... 
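+		# With k outcome levels there are k - 1 dichotomous fits, and with m beta terms
+		# (excluding the intercept), vcov is square with (k - 1) * m rows and columns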
+ + n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept) + n_betas = len(logit_models) * n_terms + vcov = np.zeros((n_betas, n_betas)) + + # Populate the variance-covariance matrix for comparisons between individually fitted models + for j in range(0, len(logit_models) - 1): + for l in range(j + 1, len(logit_models)): + Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j]) + Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l]) + Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l]) + + matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right) + j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l + + vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov + vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov + + # Populate the variance-covariance matrix within each individually fitted model + for i in range(len(logit_models)): + vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:] + + # ------------------ + # Perform Wald tests + + beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]] + wald_results = {} + + # Omnibus test + constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]] + constraint = ', '.join(constraints) + dof = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept) + wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof) + wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) + + # Individual terms + for raw_name in dmatrix_right.columns[1:]: + constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) + dof = len(logit_models) - 1 # df = (number of levels minus 2) + wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, dof) + wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) + + return BrantResult(wald_results) + +class _SMOrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel): def __init__(self, endog, exog, **kwargs): if 'distr' not in kwargs: kwargs['distr'] = 'logit' @@ -1209,3 +867,143 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel): def transform_reverse_threshold_params(self, params): return params[:-1] + +class PenalisedLogit(RegressionModel): + """ + Firth penalised logistic regression + + Uses the R *logistf* library. + + **Example:** + + .. code-block:: + + df = pd.DataFrame({ + 'Pred': [1] * 20 + [0] * 220, + 'Outcome': [1] * 40 + [0] * 200 + }) + yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False) + + .. code-block:: text + + Penalised Logistic Regression Results + ========================================================= + Dep. Variable: Outcome | No. Observations: 240 + Model: Logit | Df. Model: 1 + Method: Penalised ML | Pseudo R²: 0.37 + Date: 2022-10-19 | LL-Model: -66.43 + Time: 07:50:40 | LL-Null: -105.91 + Std. 
Errors: Non-Robust | p (LR): <0.001* + ========================================================= + β (95% CI) p + --------------------------------------------- + (Intercept) -2.28 (-2.77 - -1.85) <0.001* + Pred 5.99 (3.95 - 10.85) <0.001* + --------------------------------------------- + + The output summarises the result of the regression. + The summary table shows that a penalised method was applied. + Note that because `exp=False` was passed, the parameter estimates are not automatically exponentiated. + """ + + @property + def model_long_name(self): + return 'Penalised Logistic Regression' + + @property + def model_short_name(self): + return 'Penalised Logit' + + @classmethod + def fit(cls, data_dep, data_ind): + result = cls() + result.cov_type = 'nonrobust' + + import rpy2.robjects as ro + import rpy2.robjects.packages + import rpy2.robjects.pandas2ri + + df = data_dep.join(data_ind) + + # Drop explicit intercept term + if 'Intercept' in df: + del df['Intercept'] + + # Import logistf + ro.packages.importr('logistf') + + with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter): + with ro.local_context() as lc: + # Convert DataFrame to R + lc['df'] = df + + # Transfer other parameters to R + lc['formula_'] = df.columns[0] + ' ~ ' + ' + '.join(df.columns[1:]) + lc['alpha'] = config.alpha + + # Fit the model + model = ro.r('logistf(formula_, data=df, alpha=alpha)') + + result.dof_model = model['df'][0] + result.ll_model = model['loglik'][0] + result.ll_null = model['loglik'][1] + + result.terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])} + + return result + +# ------------------ +# Brant test helpers + +class _Dummy: pass + +def _wald_test(param_names, params, formula, vcov, df): + # Hack! Piggyback off statsmodels to compute a Wald test + + lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None) + lmr.model = _Dummy() + lmr.model.data = _Dummy() + lmr.model.data.cov_names = param_names + lmr.params = params + lmr.df_resid = df + + return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True) + +class BrantResult: + """ + Result of a Brant test for ordinal regression + + See :meth:`OrdinalLogit.brant`. + """ + + def __init__(self, tests): + #: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*) + self.tests = tests + + def __repr__(self): + if config.repr_is_summary: + return self.summary() + return super().__repr__() + + def _repr_html_(self): + out = '
<table><caption>Brant Test Results</caption><thead><tr><th></th><th>χ²</th><th>df</th><th>p</th></tr></thead><tbody>'
+		
+		for raw_name, test in self.tests.items():
+			out += '<tr><th>{}</th><td>{:.2f}</td><td>{:.0f}</td><td>{}</td></tr>'.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML))
+		
+		out += '</tbody></table>
' + return out + + def summary(self): + """ + Return a stringified summary of the *χ*:sup:`2` test + + :rtype: str + """ + + table = pd.DataFrame([ + ['{:.2f}'.format(test.statistic), '{:.0f}'.format(test.dof), fmt_p(test.pvalue, PValueStyle.TABULAR)] + for test in self.tests.values() + ], index=self.tests.keys(), columns=['χ² ', 'df', 'p ']) + + return str(table) diff --git a/yli/survival.py b/yli/survival.py index 539f741..ed27489 100644 --- a/yli/survival.py +++ b/yli/survival.py @@ -20,11 +20,7 @@ import statsmodels.api as sm from .config import config from .sig_tests import ChiSquaredResult -from .regress import RegressionResult, SingleTerm -from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable - -from datetime import datetime -import weakref +from .utils import Estimate, check_nan def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'): """ @@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'): statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by]) return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue) - -# -------------------------------- -# Interval-censored Cox regression - -def cox_interval_censored( - df, time_left, time_right, formula, *, - bootstrap_samples=100, - nan_policy='warn', - bool_baselevels=False, exp=True, -): - # TODO: Documentation - - df_ref = weakref.ref(df) - - # Check for/clean NaNs in input columns - columns = [time_left, time_right] + cols_for_formula(formula, df) - - df = df[columns] - df = check_nan(df, nan_policy) - - # FIXME: Ensure numeric type for dependent variable - #df[dep], dep_categories = as_numeric(df[dep]) - if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64': - raise NotImplementedError('Time dtypes must be float64') - - # Convert pandas nullable types for independent variables - df = convert_pandas_nullable(df) - - # --------- - # Fit model - - # lifelines.CoxPHFitter doesn't do confidence intervals so we use R - - import rpy2.robjects as ro - import rpy2.robjects.packages - import rpy2.robjects.pandas2ri - - # Convert bool to int otherwise rpy2 chokes - df = df.replace({False: 0, True: 1}) - - # Import icenReg - ro.packages.importr('icenReg') - - with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter): - with ro.local_context() as lc: - # Convert DataFrame to R - lc['df'] = df - - # Transfer other parameters to R - lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula) - lc['bootstrap_samples'] = bootstrap_samples - - # FIXME: Seed bootstrap RNG? - - # Fit the model - ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)') - - model = ro.r('model') - # Hard to access attributes through rpy2 - term_parameters = ro.r('model$coef') - term_names = ro.r('names(model$coef)') - term_cis = ro.r('confint(model)') - cov_matrix = ro.r('model$var') - llf = ro.r('model$llk')[0] - - # TODO: Handle categorical terms? 
- terms = {} - for i in range(len(term_parameters)): - # These values not directly exposed so we must calculate them - se = np.sqrt(cov_matrix[i, i]) - pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(term_parameters[i])) - - term = SingleTerm(term_names[i], Estimate(term_parameters[i], term_cis[i][0], term_cis[i][1]), pvalue) - terms[term_names[i]] = term - - result = RegressionResult( - None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None, - model, - 'Interval-Censored Cox Regression', 'CoxIC', 'MLE', - len(df), None, None, datetime.now(), 'Bootstrap', - terms, - llf, None, - None, None, None, - [], - exp - ) - - return result
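
A minimal sketch of the refactored calling convention, mirroring the updated tests (data values are illustrative, taken from the Ott & Longnecker example used in tests/test_regress.py):

    import pandas as pd
    import yli

    df = pd.DataFrame({
        'SoilPh': [3.3, 3.4, 3.5, 3.6],
        'GrowthRet': [17.78, 21.59, 15.13, 23.45],
    })

    # Model classes are now passed from yli itself (yli.OLS, yli.Logit, ...)
    # rather than from statsmodels.api; yli.regress returns a RegressionModel
    result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
    print(result.summary())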