2022-10-13 15:57:56 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
2023-02-25 15:07:42 +11:00
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
2022-10-13 15:57:56 +11:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
2022-10-13 16:24:01 +11:00
import pandas as pd
2022-11-29 14:40:50 +11:00
import patsy
2022-10-13 15:57:56 +11:00
from scipy import stats
2022-12-02 20:19:08 +11:00
from scipy . special import expit
import statsmodels , statsmodels . miscmodels . ordinal_model
2022-10-13 15:57:56 +11:00
import statsmodels . api as sm
from statsmodels . iolib . table import SimpleTable
2022-10-13 16:24:01 +11:00
from statsmodels . stats . outliers_influence import variance_inflation_factor
2022-11-29 23:00:14 +11:00
from tqdm import tqdm
2022-10-13 15:57:56 +11:00
from datetime import datetime
import itertools
2022-10-18 19:25:12 +11:00
import warnings
2022-12-02 20:46:00 +11:00
import weakref
2022-10-13 15:57:56 +11:00
2022-10-14 14:09:29 +11:00
from . bayes_factors import BayesFactor , bayesfactor_afbf
2022-10-15 23:24:29 +11:00
from . config import config
2023-02-07 18:50:07 +11:00
from . shap import ShapResult
2022-12-02 20:20:25 +11:00
from . sig_tests import ChiSquaredResult , FTestResult
2022-12-02 21:42:41 +11:00
from . utils import Estimate , PValueStyle , as_numeric , check_nan , cols_for_formula , convert_pandas_nullable , fmt_p , formula_factor_ref_category , parse_patsy_term
2022-10-13 15:57:56 +11:00
2022-10-17 21:41:19 +11:00
def vif ( df , formula = None , * , nan_policy = ' warn ' ) :
2022-10-14 14:08:51 +11:00
"""
2022-10-17 21:41:19 +11:00
Calculate the variance inflation factor ( VIF ) for each variable in * df *
2022-10-14 14:08:51 +11:00
2022-10-17 21:41:19 +11:00
: param df : Data to calculate the VIF for
: type df : DataFrame
: param formula : If specified , calculate the VIF only for the variables in the formula
: type formula : str
2022-10-18 19:25:12 +11:00
: return : The variance inflation factors
: rtype : Series
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' D ' : [ 68.58 , 67.33 , 67.33 , . . . ] ,
' T1 ' : [ 14 , 10 , 10 , . . . ] ,
' T2 ' : [ 46 , 73 , 85 , . . . ] ,
. . .
} )
yli . vif ( df [ [ ' D ' , ' T1 ' , ' T2 ' , . . . ] ] )
. . code - block : : text
D 8.318301
T1 6.081590
T2 2.457122
. . .
dtype : float64
The output shows the variance inflation factor for each variable in * df * .
2022-10-14 14:08:51 +11:00
"""
if formula :
# Only consider columns in the formula
2022-10-15 01:09:40 +11:00
df = df [ cols_for_formula ( formula , df ) ]
2022-10-13 16:24:01 +11:00
# Check for/clean NaNs
df = check_nan ( df , nan_policy )
# Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..."
df = pd . get_dummies ( df , drop_first = True ) # Convert categorical dtypes
df = df . astype ( ' float64 ' ) # Convert all other dtypes
# Add intercept column
orig_columns = list ( df . columns )
df [ ' Intercept ' ] = [ 1 ] * len ( df )
vifs = { }
for i , col in enumerate ( orig_columns ) :
vifs [ col ] = variance_inflation_factor ( df , i )
return pd . Series ( vifs )
2022-10-13 15:57:56 +11:00
# ----------
# Regression
2022-12-02 20:20:25 +11:00
class LikelihoodRatioTestResult ( ChiSquaredResult ) :
2022-10-17 21:41:19 +11:00
"""
Result of a likelihood ratio test for regression
See : meth : ` RegressionResult . lrtest_null ` .
"""
2022-10-13 15:57:56 +11:00
def __init__ ( self , statistic , dof , pvalue ) :
2022-12-02 20:20:25 +11:00
super ( ) . __init__ ( statistic , dof , pvalue )
2022-10-18 18:44:04 +11:00
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
2022-11-09 17:39:33 +11:00
return ' LR( {} ) = {:.2f} ; <i>p</i> {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-13 15:57:56 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the likelihood ratio test
: rtype : str
"""
2022-11-09 17:39:33 +11:00
return ' LR( {} ) = {:.2f} ; p {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-10-13 15:57:56 +11:00
class RegressionResult :
"""
Result of a regression
2022-10-17 21:41:19 +11:00
See : func : ` yli . regress ` .
2022-10-13 15:57:56 +11:00
"""
def __init__ ( self ,
2022-12-02 21:01:07 +11:00
model_class , df , dep , formula , nan_policy , model_kwargs , fit_kwargs ,
2022-10-13 15:57:56 +11:00
raw_result ,
full_name , model_name , fit_method ,
2023-02-25 14:46:13 +11:00
nobs , nevents , dof_model , fitted_dt , cov_type ,
2022-10-14 21:57:22 +11:00
terms ,
2022-12-03 02:02:34 +11:00
ll_model , ll_null ,
2022-10-13 15:57:56 +11:00
dof_resid , rsquared , f_statistic ,
2022-12-02 12:53:40 +11:00
comments ,
2022-10-13 15:57:56 +11:00
exp
) :
2022-12-02 20:46:00 +11:00
# Data about how model fitted
2022-12-02 21:01:07 +11:00
#: See :func:`yli.regress`
2022-12-02 20:46:00 +11:00
self . model_class = model_class
#: Data fitted (*weakref* to *DataFrame*)
self . df = df
2022-12-02 21:01:07 +11:00
#: See :func:`yli.regress`
2022-12-02 20:46:00 +11:00
self . dep = dep
2022-12-02 21:01:07 +11:00
#: See :func:`yli.regress`
2022-12-02 20:46:00 +11:00
self . formula = formula
2022-12-02 21:01:07 +11:00
#: See :func:`yli.regress`
self . nan_policy = nan_policy
#: See :func:`yli.regress`
self . model_kwargs = model_kwargs
#: See :func:`yli.regress`
self . fit_kwargs = fit_kwargs
2022-12-02 20:46:00 +11:00
2022-10-17 21:41:19 +11:00
#: Raw result from statsmodels *model.fit*
2022-10-13 15:57:56 +11:00
self . raw_result = raw_result
# Information for display
2022-10-17 21:41:19 +11:00
#: Full name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . full_name = full_name
2022-10-17 21:41:19 +11:00
#: Short name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . model_name = model_name
2022-10-17 21:41:19 +11:00
#: Method for fitting the regression model (*str*)
2022-10-13 15:57:56 +11:00
self . fit_method = fit_method
# Basic fitted model information
2022-10-17 21:41:19 +11:00
#: Number of observations (*int*)
2022-10-13 15:57:56 +11:00
self . nobs = nobs
2023-02-25 14:46:13 +11:00
#: Number of events (*int*, time-to-event models only)
self . nevents = nevents
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the model (*int*)
2022-10-13 15:57:56 +11:00
self . dof_model = dof_model
2022-10-17 21:41:19 +11:00
#: Date and time of fitting the model (Python *datetime*)
2022-10-13 15:57:56 +11:00
self . fitted_dt = fitted_dt
2022-10-17 21:41:19 +11:00
#: Method for computing the covariance matrix (*str*)
2022-10-17 20:34:58 +11:00
self . cov_type = cov_type
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
# Regression coefficients/p values
2022-10-17 21:41:19 +11:00
#: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`)
2022-10-14 21:57:22 +11:00
self . terms = terms
2022-10-13 15:57:56 +11:00
# Model log-likelihood
2022-10-17 21:41:19 +11:00
#: Log-likelihood of fitted model (*float*)
2022-12-03 02:02:34 +11:00
self . ll_model = ll_model
2022-10-17 21:41:19 +11:00
#: Log-likelihood of null model (*float*)
2022-12-03 02:02:34 +11:00
self . ll_null = ll_null
2022-10-13 15:57:56 +11:00
# Extra statistics (not all regression models have these)
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the residuals (*int*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . dof_resid = dof_resid
2022-10-17 21:41:19 +11:00
#: *R*:sup:`2` statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . rsquared = rsquared
2022-10-17 21:41:19 +11:00
#: *F* statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . f_statistic = f_statistic
2022-12-02 12:53:40 +11:00
#: Comments for the model (*List[str]*)
self . comments = comments or [ ]
2022-10-13 15:57:56 +11:00
# Config for display style
2022-10-17 21:41:19 +11:00
#: See :func:`yli.regress`
2022-10-13 15:57:56 +11:00
self . exp = exp
@property
def pseudo_rsquared ( self ) :
2022-10-17 21:41:19 +11:00
""" McFadden ' s pseudo *R*:sup:`2` statistic """
2022-10-13 15:57:56 +11:00
2022-12-03 02:02:34 +11:00
return 1 - self . ll_model / self . ll_null
2022-10-13 15:57:56 +11:00
def lrtest_null ( self ) :
2022-10-17 21:41:19 +11:00
"""
Compute the likelihood ratio test comparing the model to the null model
: rtype : : class : ` LikelihoodRatioTestResult `
"""
2022-10-13 15:57:56 +11:00
2022-12-03 02:02:34 +11:00
statistic = - 2 * ( self . ll_null - self . ll_model )
2022-10-13 15:57:56 +11:00
pvalue = 1 - stats . chi2 . cdf ( statistic , self . dof_model )
return LikelihoodRatioTestResult ( statistic , self . dof_model , pvalue )
def ftest ( self ) :
2022-10-17 21:41:19 +11:00
"""
Perform the * F * test that all slopes are 0
: rtype : : class : ` yli . sig_tests . FTestResult `
"""
2022-10-13 15:57:56 +11:00
pvalue = 1 - stats . f ( self . dof_model , self . dof_resid ) . cdf ( self . f_statistic )
return FTestResult ( self . f_statistic , self . dof_model , self . dof_resid , pvalue )
2022-10-14 14:09:29 +11:00
def bayesfactor_beta_zero ( self , term ) :
"""
2022-10-17 21:41:19 +11:00
Compute a Bayes factor testing the hypothesis that the given beta is 0
2022-10-16 02:49:43 +11:00
2022-10-17 21:41:19 +11:00
Uses the R * BFpack * library .
Requires the regression to be from statsmodels .
The term must be specified as the * raw name * from the statsmodels regression , available via : attr : ` RegressionResult . raw_result ` .
: param term : Raw name of the term to be tested
: type term : str
: rtype : : class : ` yli . bayes_factors . BayesFactor `
2022-10-14 14:09:29 +11:00
"""
2022-10-16 02:49:43 +11:00
# FIXME: Allow specifying our renamed terms
2022-10-14 14:09:29 +11:00
# Get parameters required for AFBF
2022-10-16 02:49:43 +11:00
params = pd . Series ( { raw_name . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) : beta for raw_name , beta in self . raw_result . params . items ( ) } )
2022-10-14 14:09:29 +11:00
cov = self . raw_result . cov_params ( )
# Compute BF matrix
bf01 = bayesfactor_afbf ( params , cov , self . nobs , ' {} = 0 ' . format ( term . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) ) )
bf01 = BayesFactor ( bf01 . factor , ' 0 ' , ' {} = 0 ' . format ( term ) , ' 1 ' , ' {} ≠ 0 ' . format ( term ) )
if bf01 . factor > = 1 :
return bf01
else :
return bf01 . invert ( )
2022-12-02 20:46:00 +11:00
def brant ( self ) :
2022-12-02 22:03:25 +11:00
"""
Perform the Brant test for the parallel regression assumption in ordinal regression
Applicable when the model was fitted using : class : ` OrdinalLogit ` .
: rtype : : class : ` BrantResult `
* * Example : * *
. . code - block : :
df = pd . DataFrame ( . . . )
model = yli . regress ( yli . OrdinalLogit , df , ' apply ' , ' pared + public + gpa ' , exp = False )
model . brant ( )
. . code - block : : text
χ² df p
Omnibus 4.34 3 0.23
pared 0.13 1 0.72
public 3.44 1 0.06
gpa 0.18 1 0.67
The output shows the result of the Brant test . For example , for the omnibus test of the parallel regression assumption across all independent variables , the * χ * : sup : ` 2 ` statistic is 4.34 , the * χ * : sup : ` 2 ` distribution has 3 degrees of freedom , and the test is not significant , with * p * value 0.23 .
* * Reference : * * Brant R . Assessing proportionality in the proportional odds model for ordinal logistic regression . * Biometrics * . 1990 ; 46 ( 4 ) : 1171 – 8. ` doi : 10.2307 / 2532457 < https : / / doi . org / 10.2307 / 2532457 > ` _
"""
2022-12-02 20:46:00 +11:00
df = self . df ( )
if df is None :
raise Exception ( ' Referenced DataFrame has been dropped ' )
dep = self . dep
# Check for/clean NaNs
2022-12-02 21:01:07 +11:00
# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
2022-12-02 20:46:00 +11:00
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df = df [ [ dep ] + cols_for_formula ( self . formula , df ) ]
2022-12-02 21:01:07 +11:00
df = check_nan ( df , ' omit ' )
2022-12-02 20:46:00 +11:00
# Ensure numeric type for dependent variable
2022-12-02 21:42:41 +11:00
df [ dep ] , dep_categories = as_numeric ( df [ dep ] )
2022-12-02 20:46:00 +11:00
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable ( df )
# Precompute design matrix for RHS
# This is also X+ in Brant paper
dmatrix_right = patsy . dmatrix ( self . formula , df , return_type = ' dataframe ' )
dmatrix_right . reset_index ( drop = True , inplace = True ) # Otherwise this confuses matrix multiplication
# Fit individual logistic regressions
logit_models = [ ]
2022-12-03 02:02:34 +11:00
for upper_limit in sorted ( df [ dep ] . unique ( ) ) [ : - 1 ] :
2022-12-02 20:46:00 +11:00
dep_dichotomous = ( df [ dep ] < = upper_limit ) . astype ( int ) . reset_index ( drop = True )
2022-12-02 21:01:07 +11:00
logit_result = sm . Logit ( dep_dichotomous , dmatrix_right ) . fit ( disp = False , * * self . fit_kwargs )
2022-12-02 20:46:00 +11:00
if not logit_result . mle_retvals [ ' converged ' ] :
raise Exception ( ' Maximum likelihood estimation failed to converge for {} <= {} . Check raw_result.mle_retvals. ' . format ( dep , upper_limit ) )
if pd . isna ( logit_result . bse ) . any ( ) :
raise Exception ( ' Regression returned NaN standard errors for {} <= {} . ' . format ( dep , upper_limit ) )
logit_models . append ( logit_result )
logit_betas = np . array ( [ model . _results . params for model in logit_models ] ) . T
logit_pihat = np . array ( [ expit ( - model . fittedvalues ) for model in logit_models ] ) . T # Predicted probabilities
# vcov is the variance-covariance matrix of all individually fitted betas across all terms
# | model 1 | model 2 | model 3 | ...
# | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
# model 1 | term 1 |
# | term 2 |
# model 2 | term 1 |
# | term 2 |
# ...
n_terms = len ( dmatrix_right . columns ) - 1 # number of beta terms (excluding intercept)
n_betas = len ( logit_models ) * n_terms
vcov = np . zeros ( ( n_betas , n_betas ) )
# Populate the variance-covariance matrix for comparisons between individually fitted models
for j in range ( 0 , len ( logit_models ) - 1 ) :
for l in range ( j + 1 , len ( logit_models ) ) :
Wjj = np . diag ( logit_pihat [ : , j ] - logit_pihat [ : , j ] * logit_pihat [ : , j ] )
Wjl = np . diag ( logit_pihat [ : , l ] - logit_pihat [ : , j ] * logit_pihat [ : , l ] )
Wll = np . diag ( logit_pihat [ : , l ] - logit_pihat [ : , l ] * logit_pihat [ : , l ] )
matrix_result = np . linalg . inv ( dmatrix_right . T @ Wjj @ dmatrix_right ) @ dmatrix_right . T @ Wjl @ dmatrix_right @ np . linalg . inv ( dmatrix_right . T @ Wll @ dmatrix_right )
j_vs_l_vcov = np . asarray ( matrix_result ) [ 1 : , 1 : ] # Asymptotic covariance for j,l
vcov [ j * n_terms : ( j + 1 ) * n_terms , l * n_terms : ( l + 1 ) * n_terms ] = j_vs_l_vcov
vcov [ l * n_terms : ( l + 1 ) * n_terms , j * n_terms : ( j + 1 ) * n_terms ] = j_vs_l_vcov
# Populate the variance-covariance matrix within each individually fitted model
for i in range ( len ( logit_models ) ) :
vcov [ i * n_terms : ( i + 1 ) * n_terms , i * n_terms : ( i + 1 ) * n_terms ] = logit_models [ i ] . _results . cov_params ( ) [ 1 : , 1 : ]
# ------------------
# Perform Wald tests
beta_names = [ ' {} _ {} ' . format ( raw_name , i ) for i in range ( len ( logit_models ) ) for raw_name in dmatrix_right . columns [ 1 : ] ]
wald_results = { }
# Omnibus test
constraints = [ ' = ' . join ( ' {} _ {} ' . format ( raw_name , i ) for i in range ( len ( logit_models ) ) ) for raw_name in dmatrix_right . columns [ 1 : ] ]
constraint = ' , ' . join ( constraints )
df = ( len ( logit_models ) - 1 ) * ( len ( dmatrix_right . columns ) - 1 ) # df = (number of levels minus 2) * (number of terms excluding intercept)
wald_result = _wald_test ( beta_names , logit_betas [ 1 : ] . ravel ( ' F ' ) , constraint , vcov , df )
wald_results [ ' Omnibus ' ] = ChiSquaredResult ( wald_result . statistic , wald_result . df_denom , wald_result . pvalue )
# Individual terms
for raw_name in dmatrix_right . columns [ 1 : ] :
constraint = ' = ' . join ( ' {} _ {} ' . format ( raw_name , i ) for i in range ( len ( logit_models ) ) )
df = len ( logit_models ) - 1 # df = (number of levels minus 2)
wald_result = _wald_test ( beta_names , logit_betas [ 1 : ] . ravel ( ' F ' ) , constraint , vcov , df )
wald_results [ raw_name ] = ChiSquaredResult ( wald_result . statistic , wald_result . df_denom , wald_result . pvalue )
return BrantResult ( wald_results )
2022-12-02 21:01:07 +11:00
def bootstrap ( self , samples = 1000 ) :
"""
2022-12-02 21:07:08 +11:00
Use bootstrapping to recompute confidence intervals and * p * values for the terms in the regression model
2022-12-02 21:01:07 +11:00
: param samples : Number of bootstrap samples to draw
: type samples : int
: rtype : : class : ` yli . regress . RegressionResult `
"""
df = self . df ( )
if df is None :
raise Exception ( ' Referenced DataFrame has been dropped ' )
dep = self . dep
# Check for/clean NaNs
# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df = df [ [ dep ] + cols_for_formula ( self . formula , df ) ]
df = check_nan ( df , ' omit ' )
# Ensure numeric type for dependent variable
2022-12-02 21:42:41 +11:00
df [ dep ] , dep_categories = as_numeric ( df [ dep ] )
2022-12-02 21:01:07 +11:00
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable ( df )
# Precompute design matrices
dmatrices = patsy . dmatrices ( dep + ' ~ ' + self . formula , df , return_type = ' dataframe ' )
# Fit full model
#full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs)
# Initialise bootstrap_results
bootstrap_results = { } # Dict mapping term raw names to bootstrap betas
for term in self . terms . values ( ) :
if isinstance ( term , SingleTerm ) :
bootstrap_results [ term . raw_name ] = [ ]
else :
for sub_term in term . categories . values ( ) :
bootstrap_results [ sub_term . raw_name ] = [ ]
# Draw bootstrap samples and regress
dmatrices = dmatrices [ 0 ] . join ( dmatrices [ 1 ] )
for i in tqdm ( range ( samples ) ) :
bootstrap_rows = dmatrices . sample ( len ( df ) , replace = True )
model = self . model_class ( endog = bootstrap_rows . iloc [ : , 0 ] , exog = bootstrap_rows . iloc [ : , 1 : ] , * * self . model_kwargs )
model . formula = dep + ' ~ ' + self . formula
result = model . fit ( disp = False , * * self . fit_kwargs )
for raw_name , raw_beta in zip ( model . exog_names , result . _results . params ) :
bootstrap_results [ raw_name ] . append ( raw_beta )
# Combine bootstrap results
terms = { }
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
bootstrap_betas = bootstrap_results [ term . raw_name ]
bootstrap_pvalue = sum ( 1 for b in bootstrap_betas if b < 0 ) / len ( bootstrap_betas )
bootstrap_pvalue = 2 * min ( bootstrap_pvalue , 1 - bootstrap_pvalue )
terms [ term_name ] = SingleTerm ( term . raw_name , Estimate ( term . beta . point , np . quantile ( bootstrap_betas , config . alpha / 2 ) , np . quantile ( bootstrap_betas , 1 - config . alpha / 2 ) ) , bootstrap_pvalue )
else :
categories = { }
for sub_term_name , sub_term in term . categories . items ( ) :
bootstrap_betas = bootstrap_results [ sub_term . raw_name ]
bootstrap_pvalue = sum ( 1 for b in bootstrap_betas if b < 0 ) / len ( bootstrap_betas )
bootstrap_pvalue = 2 * min ( bootstrap_pvalue , 1 - bootstrap_pvalue )
categories [ sub_term_name ] = SingleTerm ( sub_term . raw_name , Estimate ( sub_term . beta . point , np . quantile ( bootstrap_betas , config . alpha / 2 ) , np . quantile ( bootstrap_betas , 1 - config . alpha / 2 ) ) , bootstrap_pvalue )
terms [ term_name ] = CategoricalTerm ( categories , term . ref_category )
return RegressionResult (
self . model_class , self . df , dep , self . formula , self . nan_policy , self . model_kwargs , self . fit_kwargs ,
None ,
self . full_name , self . model_name , self . fit_method ,
2023-02-25 14:46:13 +11:00
self . nobs , None , self . dof_model , datetime . now ( ) , ' Bootstrap ' ,
2022-12-02 21:01:07 +11:00
terms ,
2022-12-03 02:02:34 +11:00
self . ll_model , self . ll_null ,
2022-12-02 21:01:07 +11:00
self . dof_resid , self . rsquared , self . f_statistic ,
self . comments ,
self . exp
)
2023-02-07 18:50:07 +11:00
def shap ( self , * * kwargs ) :
2023-02-25 23:46:48 +11:00
"""
Compute SHAP values for the model
Uses the Python * shap * library .
: param kwargs : Keyword arguments to pass to * shap . LinearExplainer *
: rtype : : class : ` yli . shap . ShapResult `
* * Reference : * * Lundberg SM , Lee SI . A unified approach to interpreting model predictions . In : Guyon I , Von Luxburg U , Bengio S , et al . , editors . * Advances in Neural Information Processing Systems * ; 2017 Dec 4 – 9 ; Long Beach , CA . https : / / proceedings . neurips . cc / paper / 2017 / hash / 8 a20a8621978632d76c43dfd28b67767 - Abstract . html
"""
2023-02-07 18:50:07 +11:00
import shap
xdata = ShapResult . _get_xdata ( self )
# Combine terms into single list
params = [ ]
for term in self . terms . values ( ) :
if isinstance ( term , SingleTerm ) :
params . append ( term . beta . point )
else :
params . extend ( s . beta . point for s in term . categories . values ( ) )
explainer = shap . LinearExplainer ( ( np . array ( params [ 1 : ] ) , params [ 0 ] ) , xdata , * * kwargs ) # FIXME: Assumes zeroth term is intercept
shap_values = explainer . shap_values ( xdata ) . astype ( ' float ' )
return ShapResult ( weakref . ref ( self ) , shap_values , list ( xdata . columns ) )
2022-10-13 15:57:56 +11:00
def _header_table ( self , html ) :
""" Return the entries for the header table """
# Left column
left_col = [ ]
left_col . append ( ( ' Dep. Variable: ' , self . dep ) )
left_col . append ( ( ' Model: ' , self . model_name ) )
left_col . append ( ( ' Method: ' , self . fit_method ) )
left_col . append ( ( ' Date: ' , self . fitted_dt . strftime ( ' % Y- % m- %d ' ) ) )
left_col . append ( ( ' Time: ' , self . fitted_dt . strftime ( ' % H: % M: % S ' ) ) )
2022-12-02 12:53:40 +11:00
if self . cov_type :
left_col . append ( ( ' Std. Errors: ' , ' Non-Robust ' if self . cov_type == ' nonrobust ' else self . cov_type . upper ( ) if self . cov_type . startswith ( ' hc ' ) else self . cov_type ) )
2022-10-13 15:57:56 +11:00
# Right column
right_col = [ ]
2022-10-17 20:34:58 +11:00
right_col . append ( ( ' No. Observations: ' , format ( self . nobs , ' .0f ' ) ) )
2023-02-25 14:46:13 +11:00
if self . nevents :
right_col . append ( ( ' No. Events: ' , format ( self . nevents , ' .0f ' ) ) )
2022-10-16 02:30:24 +11:00
right_col . append ( ( ' Df. Model: ' , format ( self . dof_model , ' .0f ' ) ) )
2022-10-13 15:57:56 +11:00
if self . dof_resid :
right_col . append ( ( ' Df. Residuals: ' , format ( self . dof_resid , ' .0f ' ) ) )
if self . rsquared :
right_col . append ( ( ' <i>R</i><sup>2</sup>: ' if html else ' R²: ' , format ( self . rsquared , ' .2f ' ) ) )
2023-02-25 14:46:13 +11:00
elif self . ll_null :
2022-10-13 15:57:56 +11:00
right_col . append ( ( ' Pseudo <i>R</i><sup>2</sup>: ' if html else ' Pseudo R²: ' , format ( self . pseudo_rsquared , ' .2f ' ) ) )
if self . f_statistic :
# Report the F test if available
f_result = self . ftest ( )
if html :
right_col . append ( ( ' <i>F</i>: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' <i>p</i> (<i>F</i>): ' , fmt_p ( f_result . pvalue , PValueStyle . VALUE_ONLY | PValueStyle . HTML ) ) )
2022-10-13 15:57:56 +11:00
else :
right_col . append ( ( ' F: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' p (F): ' , fmt_p ( f_result . pvalue , PValueStyle . VALUE_ONLY ) ) )
2023-02-25 15:07:42 +11:00
else :
2022-10-13 15:57:56 +11:00
# Otherwise report likelihood ratio test as overall test
2022-12-03 02:02:34 +11:00
right_col . append ( ( ' LL-Model: ' , format ( self . ll_model , ' .2f ' ) ) )
2023-02-25 15:07:42 +11:00
if self . ll_null :
lrtest_result = self . lrtest_null ( )
right_col . append ( ( ' LL-Null: ' , format ( self . ll_null , ' .2f ' ) ) )
if html :
right_col . append ( ( ' <i>p</i> (LR): ' , fmt_p ( lrtest_result . pvalue , PValueStyle . VALUE_ONLY | PValueStyle . HTML ) ) )
else :
right_col . append ( ( ' p (LR): ' , fmt_p ( lrtest_result . pvalue , PValueStyle . VALUE_ONLY ) ) )
2022-10-13 15:57:56 +11:00
return left_col , right_col
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
# Render header table
left_col , right_col = self . _header_table ( html = True )
out = ' <table><caption> {} Results</caption> ' . format ( self . full_name )
for left_cell , right_cell in itertools . zip_longest ( left_col , right_col ) :
out + = ' <tr><th> {} </th><td> {} </td><th> {} </th><td> {} </td></tr> ' . format (
left_cell [ 0 ] if left_cell else ' ' ,
left_cell [ 1 ] if left_cell else ' ' ,
right_cell [ 0 ] if right_cell else ' ' ,
right_cell [ 1 ] if right_cell else ' '
)
out + = ' </table> '
# Render coefficients table
2022-10-15 23:24:29 +11:00
out + = ' <table><tr><th></th><th style= " text-align:center " > {} </th><th colspan= " 3 " style= " text-align:center " >( {:g} % CI)</th><th style= " text-align:center " ><i>p</i></th></tr> ' . format ( ' exp(<i>β</i>) ' if self . exp else ' <i>β</i> ' , ( 1 - config . alpha ) * 100 )
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
out + = ' <tr><th> {} </th><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( term . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
out + = ' <tr><th> {} </th><td></td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term_name )
# Render reference category
2022-12-02 20:19:08 +11:00
if term . ref_category is not None :
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td>Ref.</td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term . ref_category )
2022-10-14 21:57:22 +11:00
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( sub_term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( sub_term . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
out + = ' </table> '
# TODO: Have a detailed view which shows SE, t/z, etc.
2022-12-02 12:53:40 +11:00
if self . comments :
out + = ' <ol> '
for comment in self . comments :
out + = ' <li> {} </li> ' . format ( comment )
out + = ' </ol> '
2022-10-13 15:57:56 +11:00
return out
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the regression model
: rtype : str
"""
2022-10-13 15:57:56 +11:00
# Render header table
left_col , right_col = self . _header_table ( html = False )
# Ensure equal sizes for SimpleTable
if len ( right_col ) > len ( left_col ) :
left_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( right_col ) - len ( left_col ) ) )
elif len ( left_col ) > len ( right_col ) :
right_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( left_col ) - len ( right_col ) ) )
table1 = SimpleTable ( np . concatenate ( [ left_col , right_col ] , axis = 1 ) , title = ' {} Results ' . format ( self . full_name ) )
table1 . insert_stubs ( 2 , [ ' | ' ] * len ( left_col ) )
# Get rid of last line (merge with next table)
table1_lines = table1 . as_text ( ) . splitlines ( keepends = False )
out = ' \n ' . join ( table1_lines [ : - 1 ] ) + ' \n '
# Render coefficients table
table_data = [ ]
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
# Add some extra padding
2022-11-09 17:39:33 +11:00
table_data . append ( [ term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( term . pvalue , PValueStyle . TABULAR ) ] )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
table_data . append ( [ term_name + ' ' , ' ' , ' ' , ' ' , ' ' , ' ' ] )
# Render reference category
2022-12-02 20:19:08 +11:00
if term . ref_category is not None :
table_data . append ( [ ' {} ' . format ( term . ref_category ) , ' Ref. ' , ' ' , ' ' , ' ' , ' ' ] )
2022-10-14 21:57:22 +11:00
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
table_data . append ( [ sub_term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( sub_term . pvalue , PValueStyle . TABULAR ) ] )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
table2 = SimpleTable ( data = table_data , headers = [ ' ' , ' exp(β) ' if self . exp else ' β ' , ' ' , ' \ue000 ' , ' ' , ' p ' ] ) # U+E000 is in Private Use Area, mark middle of CI column
2022-10-15 23:24:29 +11:00
table2_text = table2 . as_text ( ) . replace ( ' \ue000 ' , ' ( {:g} % CI) ' . format ( ( 1 - config . alpha ) * 100 ) ) # Render heading in the right spot
2022-10-13 15:57:56 +11:00
table2_lines = table2_text . splitlines ( keepends = False )
# Render divider line between 2 tables
max_table_len = max ( len ( table1_lines [ - 1 ] ) , len ( table2_lines [ - 1 ] ) )
out + = ' = ' * max_table_len + ' \n '
out + = ' \n ' . join ( table2_lines [ 1 : ] )
2022-12-02 12:53:40 +11:00
if self . comments :
out + = ' \n '
for i , comment in enumerate ( self . comments ) :
out + = ' \n {} . {} ' . format ( i + 1 , comment )
2022-10-13 15:57:56 +11:00
return out
2022-10-14 21:57:22 +11:00
class SingleTerm :
2022-10-17 21:41:19 +11:00
""" A term in a :class:`RegressionResult` which is a single term """
2022-10-14 21:57:22 +11:00
def __init__ ( self , raw_name , beta , pvalue ) :
2022-10-17 21:41:19 +11:00
#: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`)
2022-10-14 21:57:22 +11:00
self . raw_name = raw_name
2022-10-17 21:41:19 +11:00
#: :class:`yli.utils.Estimate` of the coefficient
2022-10-14 21:57:22 +11:00
self . beta = beta
2022-10-17 21:41:19 +11:00
#: *p* value for the coefficient (*float*)
2022-10-14 21:57:22 +11:00
self . pvalue = pvalue
class CategoricalTerm :
2022-10-17 21:41:19 +11:00
""" A group of terms in a :class:`RegressionResult` corresponding to a categorical variable """
2022-10-14 21:57:22 +11:00
def __init__ ( self , categories , ref_category ) :
2022-10-17 21:41:19 +11:00
#: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`)
2022-10-14 21:57:22 +11:00
self . categories = categories
2022-10-17 21:41:19 +11:00
#: Name of the reference category (*str*)
2022-10-14 21:57:22 +11:00
self . ref_category = ref_category
2022-10-13 15:57:56 +11:00
def regress (
model_class , df , dep , formula , * ,
2022-10-15 01:42:06 +11:00
nan_policy = ' warn ' ,
2022-10-16 02:31:08 +11:00
model_kwargs = None , fit_kwargs = None ,
2023-02-25 14:46:13 +11:00
family = None , exposure = None , status = None , # common model_kwargs
2022-12-02 12:54:03 +11:00
cov_type = None , method = None , maxiter = None , start_params = None , # common fit_kwargs
2022-11-29 14:40:50 +11:00
bool_baselevels = False , exp = None ,
2022-11-29 23:00:14 +11:00
_dmatrices = None ,
2022-10-13 15:57:56 +11:00
) :
2022-10-15 01:42:06 +11:00
"""
Fit a statsmodels regression model
2022-10-17 21:41:19 +11:00
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
2023-02-25 14:12:44 +11:00
: param exposure : Column in * df * for the exposure variable ( numeric , some models only )
: type exposure : str
2023-02-25 14:46:13 +11:00
: param status : Column in * df * for the status variable ( time - to - event models only )
: type status : str
2022-10-17 21:41:19 +11:00
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param model_kwargs : Keyword arguments to pass to * model_class * constructor
: type model_kwargs : dict
: param fit_kwargs : Keyword arguments to pass to * model . fit *
: type fit_kwargs : dict
: param family : See statsmodels * GLM * constructor
: param cov_type : See statsmodels * model . fit *
2022-12-02 12:54:03 +11:00
: param method : See statsmodels * model . fit *
2022-10-17 21:41:19 +11:00
: param maxiter : See statsmodels * model . fit *
: param start_params : See statsmodels * model . fit *
: param bool_baselevels : Show reference categories for boolean independent variables even if reference category is * False *
: type bool_baselevels : bool
: param exp : Report exponentiated parameters rather than raw parameters , default ( * None * ) is to autodetect based on * model_class *
: type exp : bool
: rtype : : class : ` yli . regress . RegressionResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Unhealthy ' : [ False , False , False , . . . ] ,
' Fibrinogen ' : [ 2.52 , 2.46 , 2.29 , . . . ] ,
' GammaGlobulin ' : [ 38 , 36 , 36 , . . . ]
} )
yli . regress ( sm . Logit , df , ' Unhealthy ' , ' Fibrinogen + GammaGlobulin ' )
. . code - block : : text
Logistic Regression Results
== == == == == == == == == == == == == == == == == == == == == == == == == == ==
Dep . Variable : Unhealthy | No . Observations : 32
Model : Logit | Df . Model : 2
Method : MLE | Df . Residuals : 29
Date : 2022 - 10 - 18 | Pseudo R² : 0.26
Time : 19 : 00 : 34 | LL - Model : - 11.47
Std . Errors : Non - Robust | LL - Null : - 15.44
| p ( LR ) : 0.02 *
== == == == == == == == == == == == == == == == == == == == == == == == == == ==
exp ( β ) ( 95 % CI ) p
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
( Intercept ) 0.00 ( 0.00 - 0.24 ) 0.03 *
Fibrinogen 6.80 ( 1.01 - 45.79 ) 0.049 *
GammaGlobulin 1.17 ( 0.92 - 1.48 ) 0.19
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The output summarises the results of the regression .
Note that the parameter estimates are automatically exponentiated .
For example , the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80 , with 95 % confidence interval 1.01 – 45.79 , and is significant with * p * value 0.049 .
2022-10-15 01:42:06 +11:00
"""
2022-10-13 15:57:56 +11:00
2022-10-16 02:31:08 +11:00
# Populate model_kwargs
if model_kwargs is None :
model_kwargs = { }
if family is not None :
model_kwargs [ ' family ' ] = family
# Populate fit_kwargs
if fit_kwargs is None :
fit_kwargs = { }
if cov_type is not None :
fit_kwargs [ ' cov_type ' ] = cov_type
2022-12-02 12:54:03 +11:00
if method is not None :
fit_kwargs [ ' method ' ] = method
2022-10-16 02:31:08 +11:00
if maxiter is not None :
fit_kwargs [ ' maxiter ' ] = maxiter
if start_params is not None :
fit_kwargs [ ' start_params ' ] = start_params
2022-10-13 15:57:56 +11:00
# Autodetect whether to exponentiate
if exp is None :
2023-02-25 14:46:13 +11:00
if model_class in ( sm . Logit , sm . PHReg , sm . Poisson , OrdinalLogit , PenalisedLogit ) :
2022-12-02 12:53:40 +11:00
exp = True
2022-10-13 15:57:56 +11:00
else :
exp = False
2022-12-02 20:46:00 +11:00
df_ref = weakref . ref ( df )
2022-11-29 14:40:50 +11:00
if _dmatrices is None :
2023-02-25 14:46:13 +11:00
# Check for/clean NaNs in input columns
columns = [ dep ] + cols_for_formula ( formula , df )
if exposure is not None :
columns . append ( exposure )
if status is not None :
columns . append ( status )
df = df [ columns ]
2022-11-29 14:40:50 +11:00
df = check_nan ( df , nan_policy )
# Ensure numeric type for dependent variable
2022-12-02 21:42:41 +11:00
df [ dep ] , dep_categories = as_numeric ( df [ dep ] )
2022-11-29 14:40:50 +11:00
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable ( df )
# Construct design matrix from formula
dmatrices = patsy . dmatrices ( dep + ' ~ ' + formula , df , return_type = ' dataframe ' )
else :
dmatrices = _dmatrices
2022-10-13 15:57:56 +11:00
2023-02-25 14:46:13 +11:00
if model_class in ( sm . PHReg , OrdinalLogit ) :
2022-12-02 12:53:40 +11:00
# Drop explicit intercept term
# FIXME: Check before dropping
dmatrices = ( dmatrices [ 0 ] , dmatrices [ 1 ] . iloc [ : , 1 : ] )
2023-02-25 14:46:13 +11:00
# Add exposure to model
2023-02-25 14:12:44 +11:00
if exposure is not None :
if df [ exposure ] . dtype == ' <m8[ns] ' :
model_kwargs [ ' exposure ' ] = df [ exposure ] . dt . total_seconds ( )
else :
model_kwargs [ ' exposure ' ] = df [ exposure ]
2023-02-25 14:46:13 +11:00
# Add status to model
if status is not None :
model_kwargs [ ' status ' ] = df [ status ]
2022-10-13 15:57:56 +11:00
# Fit model
2022-11-29 14:40:50 +11:00
model = model_class ( endog = dmatrices [ 0 ] , exog = dmatrices [ 1 ] , * * model_kwargs )
model . formula = dep + ' ~ ' + formula
2022-10-17 20:34:36 +11:00
result = model . fit ( disp = False , * * fit_kwargs )
2022-10-13 15:57:56 +11:00
2022-10-13 17:23:29 +11:00
if isinstance ( result , RegressionResult ) :
# Already processed!
2022-12-02 20:46:00 +11:00
result . model_class = model_class
result . df = df_ref
result . dep = dep
result . formula = formula
2022-12-02 21:01:07 +11:00
result . nan_policy = nan_policy
result . model_kwargs = model_kwargs
result . fit_kwargs = fit_kwargs
2022-10-13 17:23:29 +11:00
result . exp = exp
return result
2022-10-17 20:34:36 +11:00
# Check convergence
if hasattr ( result , ' mle_retvals ' ) and not result . mle_retvals [ ' converged ' ] :
warnings . warn ( ' Maximum likelihood estimation failed to converge. Check raw_result.mle_retvals. ' )
2022-10-14 21:57:22 +11:00
# Process terms
terms = { }
2022-11-29 14:40:50 +11:00
# Join term names manually because statsmodels wrapper is very slow
#confint = result.conf_int(config.alpha)
2023-02-25 14:46:13 +11:00
if hasattr ( result , ' _results ' ) :
confint = { k : v for k , v in zip ( model . exog_names , result . _results . conf_int ( config . alpha ) ) }
pvalues = { k : v for k , v in zip ( model . exog_names , result . _results . pvalues ) }
params = result . _results . params
else :
# e.g. PHReg
confint = { k : v for k , v in zip ( model . exog_names , result . conf_int ( config . alpha ) ) }
pvalues = { k : v for k , v in zip ( model . exog_names , result . pvalues ) }
params = result . params
2022-10-14 21:57:22 +11:00
2022-11-29 14:40:50 +11:00
#for raw_name, raw_beta in result.params.items():
2023-02-25 14:46:13 +11:00
for raw_name , raw_beta in zip ( model . exog_names , params ) :
2022-11-29 14:40:50 +11:00
beta = Estimate ( raw_beta , confint [ raw_name ] [ 0 ] , confint [ raw_name ] [ 1 ] )
2022-10-14 21:57:22 +11:00
# Rename terms
if raw_name == ' Intercept ' :
# Intercept term (single term)
term = ' (Intercept) '
2022-11-29 14:40:50 +11:00
terms [ term ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-12-02 20:19:08 +11:00
elif model_class is OrdinalLogit and ' / ' in raw_name :
# Group ordinal regression cutoffs
if ' (Cutoffs) ' not in terms :
terms [ ' (Cutoffs) ' ] = CategoricalTerm ( { } , None )
2022-12-02 21:42:41 +11:00
if dep_categories is None :
term = raw_name
else :
# Need to convert factorised names back into original names
bits = raw_name . split ( ' / ' )
term = dep_categories [ round ( float ( bits [ 0 ] ) ) ] + ' / ' + dep_categories [ round ( float ( bits [ 1 ] ) ) ]
terms [ ' (Cutoffs) ' ] . categories [ term ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-14 21:57:22 +11:00
else :
2022-10-15 01:09:40 +11:00
# Parse if required
factor , column , contrast = parse_patsy_term ( formula , df , raw_name )
if contrast is not None :
# Categorical term
2022-10-15 01:42:06 +11:00
if bool_baselevels is False and contrast == ' True ' and set ( df [ column ] . unique ( ) ) == set ( [ True , False ] ) :
# Treat as single term
2022-11-29 14:40:50 +11:00
terms [ column ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-15 01:42:06 +11:00
else :
# Add a new categorical term if not exists
if column not in terms :
2022-11-29 23:00:14 +11:00
ref_category = formula_factor_ref_category ( formula , df , factor )
2022-10-15 01:42:06 +11:00
terms [ column ] = CategoricalTerm ( { } , ref_category )
2022-11-29 14:40:50 +11:00
terms [ column ] . categories [ contrast ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-15 01:09:40 +11:00
else :
# Single term
2022-11-29 14:40:50 +11:00
terms [ column ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-13 15:57:56 +11:00
2022-12-03 02:02:34 +11:00
# Fit null model (for ll_null)
if hasattr ( result , ' ll_null ' ) :
ll_null = result . ll_null
elif hasattr ( result , ' llnull ' ) :
ll_null = result . llnull
2023-02-25 14:46:13 +11:00
elif model_class is sm . PHReg :
2023-02-25 15:07:42 +11:00
ll_null = model . loglike ( [ 0 for _ in result . params ] )
2022-10-13 15:57:56 +11:00
else :
2022-11-29 14:40:50 +11:00
# Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
dm_exog = pd . DataFrame ( index = dmatrices [ 0 ] . index )
dm_exog [ ' Intercept ' ] = pd . Series ( dtype = ' float64 ' )
dm_exog [ ' Intercept ' ] . fillna ( 1 , inplace = True )
result_null = model_class ( endog = dmatrices [ 0 ] , exog = dm_exog ) . fit ( )
2022-12-03 02:02:34 +11:00
ll_null = result_null . llf
2022-10-13 15:57:56 +11:00
2022-11-29 14:40:50 +11:00
if model_class is sm . OLS :
method_name = ' Least Squares '
2023-02-25 14:46:13 +11:00
elif model_class is sm . PHReg :
method_name = ' MPLE '
2022-11-29 14:40:50 +11:00
else :
# Parse raw regression results to get fit method
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
header_entries = np . vectorize ( str . strip ) ( np . concatenate ( np . split ( np . array ( result . summary ( ) . tables [ 0 ] . data ) , 2 , axis = 1 ) ) )
header_dict = { x [ 0 ] : x [ 1 ] for x in header_entries }
method_name = header_dict [ ' Method: ' ]
2022-10-13 15:57:56 +11:00
2023-02-25 14:46:13 +11:00
# Get names to display
if model_class is sm . PHReg :
short_name = ' Cox '
else :
short_name = model_class . __name__
2022-10-16 02:31:23 +11:00
if model_class is sm . Logit :
full_name = ' Logistic Regression '
2022-12-02 21:43:05 +11:00
elif model_class is OrdinalLogit :
full_name = ' Ordinal Logistic Regression '
2022-10-16 02:31:23 +11:00
else :
2023-02-25 14:46:13 +11:00
full_name = ' {} Regression ' . format ( short_name )
2022-10-16 02:31:23 +11:00
if fit_kwargs . get ( ' cov_type ' , ' nonrobust ' ) != ' nonrobust ' :
full_name = ' Robust {} ' . format ( full_name )
2022-10-13 15:57:56 +11:00
return RegressionResult (
2022-12-02 21:01:07 +11:00
model_class , df_ref , dep , formula , nan_policy , model_kwargs , fit_kwargs ,
2022-10-13 15:57:56 +11:00
result ,
2023-02-25 14:46:13 +11:00
full_name , short_name , method_name ,
getattr ( result , ' nobs ' , len ( df ) ) , df [ status ] . sum ( ) if model_class is sm . PHReg else None , result . df_model , datetime . now ( ) , getattr ( result , ' cov_type ' , ' nonrobust ' ) ,
2022-10-14 21:57:22 +11:00
terms ,
2022-12-03 02:02:34 +11:00
result . llf , ll_null ,
2022-10-13 15:57:56 +11:00
getattr ( result , ' df_resid ' , None ) , getattr ( result , ' rsquared ' , None ) , getattr ( result , ' fvalue ' , None ) ,
2022-12-02 20:19:08 +11:00
[ ] ,
2022-10-13 15:57:56 +11:00
exp
)
2022-10-13 17:23:29 +11:00
2022-10-16 02:31:37 +11:00
def logit_then_regress ( model_class , df , dep , formula , * , nan_policy = ' warn ' , * * kwargs ) :
2022-10-17 21:41:19 +11:00
"""
Perform logistic regression , then use parameters as start parameters for desired regression
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param kwargs : Passed through to : func : ` yli . regress `
: rtype : : class : ` yli . regress . RegressionResult `
"""
2022-10-16 02:31:37 +11:00
# Check for/clean NaNs
# Do this once here so we only get 1 warning
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
df = check_nan ( df , nan_policy )
# Perform logistic regression
logit_result = regress ( sm . Logit , df , dep , formula , * * kwargs )
logit_params = logit_result . raw_result . params
2022-10-17 20:34:36 +11:00
# Check convergence
if not logit_result . raw_result . mle_retvals [ ' converged ' ] :
return None
2022-10-16 02:31:37 +11:00
# Perform desired regression
return regress ( model_class , df , dep , formula , start_params = logit_params , * * kwargs )
2022-10-13 17:23:29 +11:00
# -----------------------------
# Penalised logistic regression
class PenalisedLogit ( statsmodels . discrete . discrete_model . BinaryModel ) :
"""
2022-12-02 20:19:08 +11:00
statsmodels - compatible model for computing Firth penalised logistic regression
2022-10-17 21:41:19 +11:00
Uses the R * logistf * library .
2022-10-13 17:23:29 +11:00
2022-10-17 21:41:19 +11:00
This class should be used in conjunction with : func : ` yli . regress ` .
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Pred ' : [ 1 ] * 20 + [ 0 ] * 220 ,
' Outcome ' : [ 1 ] * 40 + [ 0 ] * 200
} )
yli . regress ( yli . PenalisedLogit , df , ' Outcome ' , ' Pred ' , exp = False )
. . code - block : : text
Penalised Logistic Regression Results
== == == == == == == == == == == == == == == == == == == == == == == == == == == == =
Dep . Variable : Outcome | No . Observations : 240
Model : Logit | Df . Model : 1
Method : Penalised ML | Pseudo R² : 0.37
Date : 2022 - 10 - 19 | LL - Model : - 66.43
Time : 07 : 50 : 40 | LL - Null : - 105.91
Std . Errors : Non - Robust | p ( LR ) : < 0.001 *
== == == == == == == == == == == == == == == == == == == == == == == == == == == == =
β ( 95 % CI ) p
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
( Intercept ) - 2.28 ( - 2.77 - - 1.85 ) < 0.001 *
Pred 5.99 ( 3.95 - 10.85 ) < 0.001 *
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The output summarises the result of the regression .
The summary table shows that a penalised method was applied .
Note that because ` exp = False ` was passed , the parameter estimates are not automatically exponentiated .
2022-10-13 17:23:29 +11:00
"""
2022-10-17 20:34:58 +11:00
def fit ( self , disp = False ) :
2022-10-13 17:23:29 +11:00
import rpy2 . robjects as ro
import rpy2 . robjects . packages
import rpy2 . robjects . pandas2ri
# Assume data is already cleaned from regress()
2022-11-29 14:40:50 +11:00
df = self . data . orig_endog . join ( self . data . orig_exog )
2022-10-13 17:23:29 +11:00
# Convert bool to int otherwise rpy2 chokes
df = df . replace ( { False : 0 , True : 1 } )
# Import logistf
ro . packages . importr ( ' logistf ' )
with ro . conversion . localconverter ( ro . default_converter + ro . pandas2ri . converter ) :
with ro . local_context ( ) as lc :
# Convert DataFrame to R
lc [ ' df ' ] = df
# Transfer other parameters to R
lc [ ' formula_ ' ] = self . formula
2022-10-15 23:24:29 +11:00
lc [ ' alpha ' ] = config . alpha
2022-10-13 17:23:29 +11:00
# Fit the model
2022-10-15 23:24:29 +11:00
model = ro . r ( ' logistf(formula_, data=df, alpha=alpha) ' )
2022-10-13 17:23:29 +11:00
2022-10-14 21:57:22 +11:00
# TODO: Handle categorical terms?
terms = { t : SingleTerm ( t , Estimate ( b , ci0 , ci1 ) , p ) for t , b , ci0 , ci1 , p in zip ( model [ ' terms ' ] , model [ ' coefficients ' ] , model [ ' ci.lower ' ] , model [ ' ci.upper ' ] , model [ ' prob ' ] ) }
2022-10-13 17:23:29 +11:00
return RegressionResult (
2022-12-02 21:01:07 +11:00
None , None , None , None , None , None , None , # Set in regress()
2022-10-13 17:23:29 +11:00
model ,
' Penalised Logistic Regression ' , ' Logit ' , ' Penalised ML ' ,
2023-02-25 14:46:13 +11:00
model [ ' n ' ] [ 0 ] , None , model [ ' df ' ] [ 0 ] , datetime . now ( ) , ' nonrobust ' ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 17:23:29 +11:00
model [ ' loglik ' ] [ 0 ] , model [ ' loglik ' ] [ 1 ] ,
None , None , None ,
2022-12-02 12:53:40 +11:00
[ ] ,
2022-10-13 17:23:29 +11:00
None # Set exp in regress()
)
2022-12-02 12:53:40 +11:00
2022-12-02 20:19:08 +11:00
# ------------------------------------------------------
# Ordinal logistic regression (R/Stata parameterisation)
class OrdinalLogit ( statsmodels . miscmodels . ordinal_model . OrderedModel ) :
"""
statsmodels - compatible model for computing ordinal logistic ( or probit ) regression
The implementation subclasses statsmodels ' native *OrderedModel*, but substitutes an alternative parameterisation used by R and Stata.
2022-12-03 02:02:34 +11:00
In the native statsmodels implementation , the first cutoff parameter is the true cutoff , but further cutoff parameter are log differences between consecutive cutoffs .
2022-12-02 20:19:08 +11:00
In this parameterisation , cutoff terms are represented directly in the model .
2022-12-02 21:43:05 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( . . . )
yli . regress ( yli . OrdinalLogit , df , ' apply ' , ' pared + public + gpa ' , exp = False )
. . code - block : : text
Ordinal Logistic Regression Results
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
Dep . Variable : apply | No . Observations : 400
Model : OrdinalLogit | Df . Model : 5
Method : Maximum Likelihood | Df . Residuals : 395
Date : 2022 - 12 - 02 | Pseudo R² : 0.03
Time : 21 : 30 : 38 | LL - Model : - 358.51
Std . Errors : Non - Robust | LL - Null : - 370.60
| p ( LR ) : < 0.001 *
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
β ( 95 % CI ) p
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pared 1.05 ( 0.53 - 1.57 ) < 0.001 *
public - 0.06 ( - 0.64 - 0.53 ) 0.84
gpa 0.62 ( 0.10 - 1.13 ) 0.02 *
( Cutoffs )
unlikely / somewhat likely 2.20 ( 0.68 - 3.73 ) 0.005 *
somewhat likely / very likely 4.30 ( 2.72 - 5.88 ) < 0.001 *
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The output summarises the result of the regression .
The parameters shown under " (Cutoffs) " are the cutoff values in the latent variable parameterisation of ordinal regression .
Note that because ` exp = False ` was passed , the parameter estimates are not automatically exponentiated .
2022-12-02 20:19:08 +11:00
"""
def __init__ ( self , endog , exog , * * kwargs ) :
if ' distr ' not in kwargs :
kwargs [ ' distr ' ] = ' logit '
super ( ) . __init__ ( endog , exog , * * kwargs )
def transform_threshold_params ( self , params ) :
th_params = params [ - ( self . k_levels - 1 ) : ]
thresh = np . concatenate ( ( [ - np . inf ] , th_params , [ np . inf ] ) )
return thresh
def transform_reverse_threshold_params ( self , params ) :
return params [ : - 1 ]
2022-12-02 20:20:25 +11:00
class _Dummy : pass
def _wald_test ( param_names , params , formula , vcov , df ) :
# Hack! Piggyback off statsmodels to compute a Wald test
lmr = statsmodels . base . model . LikelihoodModelResults ( model = None , params = None )
lmr . model = _Dummy ( )
lmr . model . data = _Dummy ( )
lmr . model . data . cov_names = param_names
lmr . params = params
lmr . df_resid = df
return lmr . wald_test ( formula , cov_p = vcov , use_f = False , scalar = True )
class BrantResult :
2022-12-02 22:03:25 +11:00
"""
Result of a Brant test for ordinal regression
See : meth : ` RegressionResult . brant ` .
"""
2022-12-02 20:20:25 +11:00
def __init__ ( self , tests ) :
#: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*)
self . tests = tests
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
def _repr_html_ ( self ) :
out = ' <table><caption>Brant Test Results</caption><thead><tr><th></th><th style= " text-align:center " ><i>χ</i><sup>2</sup></th><th style= " text-align:center " >df</th><th style= " text-align:center " ><i>p</i></th></thead><tbody> '
for raw_name , test in self . tests . items ( ) :
out + = ' <tr><th> {} </th><td> {:.2f} </td><td> {:.0f} </td><td style= " text-align:left " > {} </td></tr> ' . format ( raw_name , test . statistic , test . dof , fmt_p ( test . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
out + = ' </tbody></table> '
return out
def summary ( self ) :
"""
Return a stringified summary of the * χ * : sup : ` 2 ` test
: rtype : str
"""
2022-12-02 21:53:07 +11:00
table = pd . DataFrame ( [
[ ' {:.2f} ' . format ( test . statistic ) , ' {:.0f} ' . format ( test . dof ) , fmt_p ( test . pvalue , PValueStyle . TABULAR ) ]
for test in self . tests . values ( )
] , index = self . tests . keys ( ) , columns = [ ' χ² ' , ' df ' , ' p ' ] )
return str ( table )