2022-10-13 15:57:56 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
2022-10-13 16:24:01 +11:00
import pandas as pd
2022-11-29 14:40:50 +11:00
import patsy
2022-10-13 15:57:56 +11:00
from scipy import stats
2022-10-13 17:23:29 +11:00
import statsmodels
2022-10-13 15:57:56 +11:00
import statsmodels . api as sm
from statsmodels . iolib . table import SimpleTable
2022-12-02 12:53:40 +11:00
from statsmodels . miscmodels . ordinal_model import OrderedModel
2022-10-13 16:24:01 +11:00
from statsmodels . stats . outliers_influence import variance_inflation_factor
2022-11-29 23:00:14 +11:00
from tqdm import tqdm
2022-10-13 15:57:56 +11:00
from datetime import datetime
import itertools
2022-10-18 19:25:12 +11:00
import warnings
2022-10-13 15:57:56 +11:00
2022-10-14 14:09:29 +11:00
from . bayes_factors import BayesFactor , bayesfactor_afbf
2022-10-15 23:24:29 +11:00
from . config import config
2022-10-14 20:18:25 +11:00
from . sig_tests import FTestResult
2022-11-09 17:39:33 +11:00
from . utils import Estimate , PValueStyle , check_nan , cols_for_formula , convert_pandas_nullable , fmt_p , formula_factor_ref_category , parse_patsy_term
2022-10-13 15:57:56 +11:00
2022-10-17 21:41:19 +11:00
def vif ( df , formula = None , * , nan_policy = ' warn ' ) :
2022-10-14 14:08:51 +11:00
"""
2022-10-17 21:41:19 +11:00
Calculate the variance inflation factor ( VIF ) for each variable in * df *
2022-10-14 14:08:51 +11:00
2022-10-17 21:41:19 +11:00
: param df : Data to calculate the VIF for
: type df : DataFrame
: param formula : If specified , calculate the VIF only for the variables in the formula
: type formula : str
2022-10-18 19:25:12 +11:00
: return : The variance inflation factors
: rtype : Series
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' D ' : [ 68.58 , 67.33 , 67.33 , . . . ] ,
' T1 ' : [ 14 , 10 , 10 , . . . ] ,
' T2 ' : [ 46 , 73 , 85 , . . . ] ,
. . .
} )
yli . vif ( df [ [ ' D ' , ' T1 ' , ' T2 ' , . . . ] ] )
. . code - block : : text
D 8.318301
T1 6.081590
T2 2.457122
. . .
dtype : float64
The output shows the variance inflation factor for each variable in * df * .
2022-10-14 14:08:51 +11:00
"""
if formula :
# Only consider columns in the formula
2022-10-15 01:09:40 +11:00
df = df [ cols_for_formula ( formula , df ) ]
2022-10-13 16:24:01 +11:00
# Check for/clean NaNs
df = check_nan ( df , nan_policy )
# Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..."
df = pd . get_dummies ( df , drop_first = True ) # Convert categorical dtypes
df = df . astype ( ' float64 ' ) # Convert all other dtypes
# Add intercept column
orig_columns = list ( df . columns )
df [ ' Intercept ' ] = [ 1 ] * len ( df )
vifs = { }
for i , col in enumerate ( orig_columns ) :
vifs [ col ] = variance_inflation_factor ( df , i )
return pd . Series ( vifs )
2022-10-13 15:57:56 +11:00
# ----------
# Regression
class LikelihoodRatioTestResult :
2022-10-17 21:41:19 +11:00
"""
Result of a likelihood ratio test for regression
See : meth : ` RegressionResult . lrtest_null ` .
"""
2022-10-13 15:57:56 +11:00
def __init__ ( self , statistic , dof , pvalue ) :
2022-10-17 21:41:19 +11:00
#: Likelihood ratio test statistic (*float*)
2022-10-13 15:57:56 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the likelihood ratio test statistic (*int*)
2022-10-13 15:57:56 +11:00
self . dof = dof
2022-10-17 21:41:19 +11:00
#: *p* value for the likelihood ratio test (*float*)
2022-10-13 15:57:56 +11:00
self . pvalue = pvalue
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
2022-11-09 17:39:33 +11:00
return ' LR( {} ) = {:.2f} ; <i>p</i> {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION | PValueStyle . HTML ) )
2022-10-13 15:57:56 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the likelihood ratio test
: rtype : str
"""
2022-11-09 17:39:33 +11:00
return ' LR( {} ) = {:.2f} ; p {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , PValueStyle . RELATION ) )
2022-10-13 15:57:56 +11:00
class RegressionResult :
"""
Result of a regression
2022-10-17 21:41:19 +11:00
See : func : ` yli . regress ` .
2022-10-13 15:57:56 +11:00
"""
def __init__ ( self ,
raw_result ,
full_name , model_name , fit_method ,
2022-10-17 20:34:58 +11:00
dep , nobs , dof_model , fitted_dt , cov_type ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 15:57:56 +11:00
llf , llnull ,
dof_resid , rsquared , f_statistic ,
2022-12-02 12:53:40 +11:00
comments ,
2022-10-13 15:57:56 +11:00
exp
) :
2022-10-17 21:41:19 +11:00
#: Raw result from statsmodels *model.fit*
2022-10-13 15:57:56 +11:00
self . raw_result = raw_result
# Information for display
2022-10-17 21:41:19 +11:00
#: Full name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . full_name = full_name
2022-10-17 21:41:19 +11:00
#: Short name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . model_name = model_name
2022-10-17 21:41:19 +11:00
#: Method for fitting the regression model (*str*)
2022-10-13 15:57:56 +11:00
self . fit_method = fit_method
# Basic fitted model information
2022-10-17 21:41:19 +11:00
#: Name of the dependent variable (*str*)
2022-10-13 15:57:56 +11:00
self . dep = dep
2022-10-17 21:41:19 +11:00
#: Number of observations (*int*)
2022-10-13 15:57:56 +11:00
self . nobs = nobs
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the model (*int*)
2022-10-13 15:57:56 +11:00
self . dof_model = dof_model
2022-10-17 21:41:19 +11:00
#: Date and time of fitting the model (Python *datetime*)
2022-10-13 15:57:56 +11:00
self . fitted_dt = fitted_dt
2022-10-17 21:41:19 +11:00
#: Method for computing the covariance matrix (*str*)
2022-10-17 20:34:58 +11:00
self . cov_type = cov_type
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
# Regression coefficients/p values
2022-10-17 21:41:19 +11:00
#: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`)
2022-10-14 21:57:22 +11:00
self . terms = terms
2022-10-13 15:57:56 +11:00
# Model log-likelihood
2022-10-17 21:41:19 +11:00
#: Log-likelihood of fitted model (*float*)
2022-10-13 15:57:56 +11:00
self . llf = llf
2022-10-17 21:41:19 +11:00
#: Log-likelihood of null model (*float*)
2022-10-13 15:57:56 +11:00
self . llnull = llnull
# Extra statistics (not all regression models have these)
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the residuals (*int*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . dof_resid = dof_resid
2022-10-17 21:41:19 +11:00
#: *R*:sup:`2` statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . rsquared = rsquared
2022-10-17 21:41:19 +11:00
#: *F* statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . f_statistic = f_statistic
2022-12-02 12:53:40 +11:00
#: Comments for the model (*List[str]*)
self . comments = comments or [ ]
2022-10-13 15:57:56 +11:00
# Config for display style
2022-10-17 21:41:19 +11:00
#: See :func:`yli.regress`
2022-10-13 15:57:56 +11:00
self . exp = exp
@property
def pseudo_rsquared ( self ) :
2022-10-17 21:41:19 +11:00
""" McFadden ' s pseudo *R*:sup:`2` statistic """
2022-10-13 15:57:56 +11:00
return 1 - self . llf / self . llnull
def lrtest_null ( self ) :
2022-10-17 21:41:19 +11:00
"""
Compute the likelihood ratio test comparing the model to the null model
: rtype : : class : ` LikelihoodRatioTestResult `
"""
2022-10-13 15:57:56 +11:00
statistic = - 2 * ( self . llnull - self . llf )
pvalue = 1 - stats . chi2 . cdf ( statistic , self . dof_model )
return LikelihoodRatioTestResult ( statistic , self . dof_model , pvalue )
def ftest ( self ) :
2022-10-17 21:41:19 +11:00
"""
Perform the * F * test that all slopes are 0
: rtype : : class : ` yli . sig_tests . FTestResult `
"""
2022-10-13 15:57:56 +11:00
pvalue = 1 - stats . f ( self . dof_model , self . dof_resid ) . cdf ( self . f_statistic )
return FTestResult ( self . f_statistic , self . dof_model , self . dof_resid , pvalue )
2022-10-14 14:09:29 +11:00
def bayesfactor_beta_zero ( self , term ) :
"""
2022-10-17 21:41:19 +11:00
Compute a Bayes factor testing the hypothesis that the given beta is 0
2022-10-16 02:49:43 +11:00
2022-10-17 21:41:19 +11:00
Uses the R * BFpack * library .
Requires the regression to be from statsmodels .
The term must be specified as the * raw name * from the statsmodels regression , available via : attr : ` RegressionResult . raw_result ` .
: param term : Raw name of the term to be tested
: type term : str
: rtype : : class : ` yli . bayes_factors . BayesFactor `
2022-10-14 14:09:29 +11:00
"""
2022-10-16 02:49:43 +11:00
# FIXME: Allow specifying our renamed terms
2022-10-14 14:09:29 +11:00
# Get parameters required for AFBF
2022-10-16 02:49:43 +11:00
params = pd . Series ( { raw_name . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) : beta for raw_name , beta in self . raw_result . params . items ( ) } )
2022-10-14 14:09:29 +11:00
cov = self . raw_result . cov_params ( )
# Compute BF matrix
bf01 = bayesfactor_afbf ( params , cov , self . nobs , ' {} = 0 ' . format ( term . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) ) )
bf01 = BayesFactor ( bf01 . factor , ' 0 ' , ' {} = 0 ' . format ( term ) , ' 1 ' , ' {} ≠ 0 ' . format ( term ) )
if bf01 . factor > = 1 :
return bf01
else :
return bf01 . invert ( )
2022-10-13 15:57:56 +11:00
def _header_table ( self , html ) :
""" Return the entries for the header table """
# Left column
left_col = [ ]
left_col . append ( ( ' Dep. Variable: ' , self . dep ) )
left_col . append ( ( ' Model: ' , self . model_name ) )
left_col . append ( ( ' Method: ' , self . fit_method ) )
left_col . append ( ( ' Date: ' , self . fitted_dt . strftime ( ' % Y- % m- %d ' ) ) )
left_col . append ( ( ' Time: ' , self . fitted_dt . strftime ( ' % H: % M: % S ' ) ) )
2022-12-02 12:53:40 +11:00
if self . cov_type :
left_col . append ( ( ' Std. Errors: ' , ' Non-Robust ' if self . cov_type == ' nonrobust ' else self . cov_type . upper ( ) if self . cov_type . startswith ( ' hc ' ) else self . cov_type ) )
2022-10-13 15:57:56 +11:00
# Right column
right_col = [ ]
2022-10-17 20:34:58 +11:00
right_col . append ( ( ' No. Observations: ' , format ( self . nobs , ' .0f ' ) ) )
2022-10-16 02:30:24 +11:00
right_col . append ( ( ' Df. Model: ' , format ( self . dof_model , ' .0f ' ) ) )
2022-10-13 15:57:56 +11:00
if self . dof_resid :
right_col . append ( ( ' Df. Residuals: ' , format ( self . dof_resid , ' .0f ' ) ) )
if self . rsquared :
right_col . append ( ( ' <i>R</i><sup>2</sup>: ' if html else ' R²: ' , format ( self . rsquared , ' .2f ' ) ) )
else :
right_col . append ( ( ' Pseudo <i>R</i><sup>2</sup>: ' if html else ' Pseudo R²: ' , format ( self . pseudo_rsquared , ' .2f ' ) ) )
if self . f_statistic :
# Report the F test if available
f_result = self . ftest ( )
if html :
right_col . append ( ( ' <i>F</i>: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' <i>p</i> (<i>F</i>): ' , fmt_p ( f_result . pvalue , PValueStyle . VALUE_ONLY | PValueStyle . HTML ) ) )
2022-10-13 15:57:56 +11:00
else :
right_col . append ( ( ' F: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' p (F): ' , fmt_p ( f_result . pvalue , PValueStyle . VALUE_ONLY ) ) )
2022-10-13 15:57:56 +11:00
else :
# Otherwise report likelihood ratio test as overall test
lrtest_result = self . lrtest_null ( )
right_col . append ( ( ' LL-Model: ' , format ( self . llf , ' .2f ' ) ) )
right_col . append ( ( ' LL-Null: ' , format ( self . llnull , ' .2f ' ) ) )
if html :
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' <i>p</i> (LR): ' , fmt_p ( lrtest_result . pvalue , PValueStyle . VALUE_ONLY | PValueStyle . HTML ) ) )
2022-10-13 15:57:56 +11:00
else :
2022-11-09 17:39:33 +11:00
right_col . append ( ( ' p (LR): ' , fmt_p ( lrtest_result . pvalue , PValueStyle . VALUE_ONLY ) ) )
2022-10-13 15:57:56 +11:00
return left_col , right_col
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
# Render header table
left_col , right_col = self . _header_table ( html = True )
out = ' <table><caption> {} Results</caption> ' . format ( self . full_name )
for left_cell , right_cell in itertools . zip_longest ( left_col , right_col ) :
out + = ' <tr><th> {} </th><td> {} </td><th> {} </th><td> {} </td></tr> ' . format (
left_cell [ 0 ] if left_cell else ' ' ,
left_cell [ 1 ] if left_cell else ' ' ,
right_cell [ 0 ] if right_cell else ' ' ,
right_cell [ 1 ] if right_cell else ' '
)
out + = ' </table> '
# Render coefficients table
2022-10-15 23:24:29 +11:00
out + = ' <table><tr><th></th><th style= " text-align:center " > {} </th><th colspan= " 3 " style= " text-align:center " >( {:g} % CI)</th><th style= " text-align:center " ><i>p</i></th></tr> ' . format ( ' exp(<i>β</i>) ' if self . exp else ' <i>β</i> ' , ( 1 - config . alpha ) * 100 )
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
out + = ' <tr><th> {} </th><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( term . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
out + = ' <tr><th> {} </th><td></td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term_name )
# Render reference category
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td>Ref.</td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term . ref_category )
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( sub_term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( sub_term . pvalue , PValueStyle . TABULAR | PValueStyle . HTML ) )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
out + = ' </table> '
# TODO: Have a detailed view which shows SE, t/z, etc.
2022-12-02 12:53:40 +11:00
if self . comments :
out + = ' <ol> '
for comment in self . comments :
out + = ' <li> {} </li> ' . format ( comment )
out + = ' </ol> '
2022-10-13 15:57:56 +11:00
return out
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the regression model
: rtype : str
"""
2022-10-13 15:57:56 +11:00
# Render header table
left_col , right_col = self . _header_table ( html = False )
# Ensure equal sizes for SimpleTable
if len ( right_col ) > len ( left_col ) :
left_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( right_col ) - len ( left_col ) ) )
elif len ( left_col ) > len ( right_col ) :
right_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( left_col ) - len ( right_col ) ) )
table1 = SimpleTable ( np . concatenate ( [ left_col , right_col ] , axis = 1 ) , title = ' {} Results ' . format ( self . full_name ) )
table1 . insert_stubs ( 2 , [ ' | ' ] * len ( left_col ) )
# Get rid of last line (merge with next table)
table1_lines = table1 . as_text ( ) . splitlines ( keepends = False )
out = ' \n ' . join ( table1_lines [ : - 1 ] ) + ' \n '
# Render coefficients table
table_data = [ ]
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
# Add some extra padding
2022-11-09 17:39:33 +11:00
table_data . append ( [ term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( term . pvalue , PValueStyle . TABULAR ) ] )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
table_data . append ( [ term_name + ' ' , ' ' , ' ' , ' ' , ' ' , ' ' ] )
# Render reference category
table_data . append ( [ ' {} ' . format ( term . ref_category ) , ' Ref. ' , ' ' , ' ' , ' ' , ' ' ] )
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-11-09 17:39:33 +11:00
table_data . append ( [ sub_term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( sub_term . pvalue , PValueStyle . TABULAR ) ] )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
table2 = SimpleTable ( data = table_data , headers = [ ' ' , ' exp(β) ' if self . exp else ' β ' , ' ' , ' \ue000 ' , ' ' , ' p ' ] ) # U+E000 is in Private Use Area, mark middle of CI column
2022-10-15 23:24:29 +11:00
table2_text = table2 . as_text ( ) . replace ( ' \ue000 ' , ' ( {:g} % CI) ' . format ( ( 1 - config . alpha ) * 100 ) ) # Render heading in the right spot
2022-10-13 15:57:56 +11:00
table2_lines = table2_text . splitlines ( keepends = False )
# Render divider line between 2 tables
max_table_len = max ( len ( table1_lines [ - 1 ] ) , len ( table2_lines [ - 1 ] ) )
out + = ' = ' * max_table_len + ' \n '
out + = ' \n ' . join ( table2_lines [ 1 : ] )
2022-12-02 12:53:40 +11:00
if self . comments :
out + = ' \n '
for i , comment in enumerate ( self . comments ) :
out + = ' \n {} . {} ' . format ( i + 1 , comment )
2022-10-13 15:57:56 +11:00
return out
2022-10-14 21:57:22 +11:00
class SingleTerm :
2022-10-17 21:41:19 +11:00
""" A term in a :class:`RegressionResult` which is a single term """
2022-10-14 21:57:22 +11:00
def __init__ ( self , raw_name , beta , pvalue ) :
2022-10-17 21:41:19 +11:00
#: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`)
2022-10-14 21:57:22 +11:00
self . raw_name = raw_name
2022-10-17 21:41:19 +11:00
#: :class:`yli.utils.Estimate` of the coefficient
2022-10-14 21:57:22 +11:00
self . beta = beta
2022-10-17 21:41:19 +11:00
#: *p* value for the coefficient (*float*)
2022-10-14 21:57:22 +11:00
self . pvalue = pvalue
class CategoricalTerm :
2022-10-17 21:41:19 +11:00
""" A group of terms in a :class:`RegressionResult` corresponding to a categorical variable """
2022-10-14 21:57:22 +11:00
def __init__ ( self , categories , ref_category ) :
2022-10-17 21:41:19 +11:00
#: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`)
2022-10-14 21:57:22 +11:00
self . categories = categories
2022-10-17 21:41:19 +11:00
#: Name of the reference category (*str*)
2022-10-14 21:57:22 +11:00
self . ref_category = ref_category
2022-10-13 15:57:56 +11:00
def regress (
model_class , df , dep , formula , * ,
2022-10-15 01:42:06 +11:00
nan_policy = ' warn ' ,
2022-10-16 02:31:08 +11:00
model_kwargs = None , fit_kwargs = None ,
family = None , # common model_kwargs
2022-12-02 12:54:03 +11:00
cov_type = None , method = None , maxiter = None , start_params = None , # common fit_kwargs
2022-11-29 14:40:50 +11:00
bool_baselevels = False , exp = None ,
2022-11-29 23:00:14 +11:00
_dmatrices = None ,
2022-10-13 15:57:56 +11:00
) :
2022-10-15 01:42:06 +11:00
"""
Fit a statsmodels regression model
2022-10-17 21:41:19 +11:00
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param model_kwargs : Keyword arguments to pass to * model_class * constructor
: type model_kwargs : dict
: param fit_kwargs : Keyword arguments to pass to * model . fit *
: type fit_kwargs : dict
: param family : See statsmodels * GLM * constructor
: param cov_type : See statsmodels * model . fit *
2022-12-02 12:54:03 +11:00
: param method : See statsmodels * model . fit *
2022-10-17 21:41:19 +11:00
: param maxiter : See statsmodels * model . fit *
: param start_params : See statsmodels * model . fit *
: param bool_baselevels : Show reference categories for boolean independent variables even if reference category is * False *
: type bool_baselevels : bool
: param exp : Report exponentiated parameters rather than raw parameters , default ( * None * ) is to autodetect based on * model_class *
: type exp : bool
: rtype : : class : ` yli . regress . RegressionResult `
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Unhealthy ' : [ False , False , False , . . . ] ,
' Fibrinogen ' : [ 2.52 , 2.46 , 2.29 , . . . ] ,
' GammaGlobulin ' : [ 38 , 36 , 36 , . . . ]
} )
yli . regress ( sm . Logit , df , ' Unhealthy ' , ' Fibrinogen + GammaGlobulin ' )
. . code - block : : text
Logistic Regression Results
== == == == == == == == == == == == == == == == == == == == == == == == == == ==
Dep . Variable : Unhealthy | No . Observations : 32
Model : Logit | Df . Model : 2
Method : MLE | Df . Residuals : 29
Date : 2022 - 10 - 18 | Pseudo R² : 0.26
Time : 19 : 00 : 34 | LL - Model : - 11.47
Std . Errors : Non - Robust | LL - Null : - 15.44
| p ( LR ) : 0.02 *
== == == == == == == == == == == == == == == == == == == == == == == == == == ==
exp ( β ) ( 95 % CI ) p
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
( Intercept ) 0.00 ( 0.00 - 0.24 ) 0.03 *
Fibrinogen 6.80 ( 1.01 - 45.79 ) 0.049 *
GammaGlobulin 1.17 ( 0.92 - 1.48 ) 0.19
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The output summarises the results of the regression .
Note that the parameter estimates are automatically exponentiated .
For example , the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80 , with 95 % confidence interval 1.01 – 45.79 , and is significant with * p * value 0.049 .
2022-10-15 01:42:06 +11:00
"""
2022-10-13 15:57:56 +11:00
2022-10-16 02:31:08 +11:00
# Populate model_kwargs
if model_kwargs is None :
model_kwargs = { }
if family is not None :
model_kwargs [ ' family ' ] = family
# Populate fit_kwargs
if fit_kwargs is None :
fit_kwargs = { }
if cov_type is not None :
fit_kwargs [ ' cov_type ' ] = cov_type
2022-12-02 12:54:03 +11:00
if method is not None :
fit_kwargs [ ' method ' ] = method
2022-10-16 02:31:08 +11:00
if maxiter is not None :
fit_kwargs [ ' maxiter ' ] = maxiter
if start_params is not None :
fit_kwargs [ ' start_params ' ] = start_params
2022-10-13 15:57:56 +11:00
# Autodetect whether to exponentiate
if exp is None :
2022-10-16 02:31:08 +11:00
if model_class in ( sm . Logit , sm . Poisson , PenalisedLogit ) :
2022-10-13 15:57:56 +11:00
exp = True
2022-12-02 12:53:40 +11:00
elif model_class is OrderedModel and model_kwargs . get ( ' distr ' , ' probit ' ) == ' logit ' :
exp = True
2022-10-13 15:57:56 +11:00
else :
exp = False
2022-11-29 14:40:50 +11:00
if _dmatrices is None :
# Check for/clean NaNs
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
df = check_nan ( df , nan_policy )
# Ensure numeric type for dependent variable
if df [ dep ] . dtype != ' float64 ' :
df [ dep ] = df [ dep ] . astype ( ' float64 ' )
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable ( df )
# Construct design matrix from formula
dmatrices = patsy . dmatrices ( dep + ' ~ ' + formula , df , return_type = ' dataframe ' )
else :
dmatrices = _dmatrices
2022-10-13 15:57:56 +11:00
2022-12-02 12:53:40 +11:00
if model_class is OrderedModel :
# Drop explicit intercept term
# FIXME: Check before dropping
dmatrices = ( dmatrices [ 0 ] , dmatrices [ 1 ] . iloc [ : , 1 : ] )
2022-10-13 15:57:56 +11:00
# Fit model
2022-11-29 14:40:50 +11:00
model = model_class ( endog = dmatrices [ 0 ] , exog = dmatrices [ 1 ] , * * model_kwargs )
model . formula = dep + ' ~ ' + formula
2022-10-17 20:34:36 +11:00
result = model . fit ( disp = False , * * fit_kwargs )
2022-10-13 15:57:56 +11:00
2022-10-13 17:23:29 +11:00
if isinstance ( result , RegressionResult ) :
# Already processed!
result . exp = exp
return result
2022-10-17 20:34:36 +11:00
# Check convergence
if hasattr ( result , ' mle_retvals ' ) and not result . mle_retvals [ ' converged ' ] :
warnings . warn ( ' Maximum likelihood estimation failed to converge. Check raw_result.mle_retvals. ' )
2022-10-14 21:57:22 +11:00
# Process terms
terms = { }
2022-11-29 14:40:50 +11:00
# Join term names manually because statsmodels wrapper is very slow
#confint = result.conf_int(config.alpha)
confint = { k : v for k , v in zip ( model . exog_names , result . _results . conf_int ( config . alpha ) ) }
pvalues = { k : v for k , v in zip ( model . exog_names , result . _results . pvalues ) }
2022-10-14 21:57:22 +11:00
2022-11-29 14:40:50 +11:00
#for raw_name, raw_beta in result.params.items():
for raw_name , raw_beta in zip ( model . exog_names , result . _results . params ) :
beta = Estimate ( raw_beta , confint [ raw_name ] [ 0 ] , confint [ raw_name ] [ 1 ] )
2022-10-14 21:57:22 +11:00
# Rename terms
if raw_name == ' Intercept ' :
# Intercept term (single term)
term = ' (Intercept) '
2022-11-29 14:40:50 +11:00
terms [ term ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-12-02 12:53:40 +11:00
elif model_class is OrderedModel and ' / ' in raw_name :
# Ignore ordinal regression intercepts
pass
2022-10-14 21:57:22 +11:00
else :
2022-10-15 01:09:40 +11:00
# Parse if required
factor , column , contrast = parse_patsy_term ( formula , df , raw_name )
if contrast is not None :
# Categorical term
2022-10-15 01:42:06 +11:00
if bool_baselevels is False and contrast == ' True ' and set ( df [ column ] . unique ( ) ) == set ( [ True , False ] ) :
# Treat as single term
2022-11-29 14:40:50 +11:00
terms [ column ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-15 01:42:06 +11:00
else :
# Add a new categorical term if not exists
if column not in terms :
2022-11-29 23:00:14 +11:00
ref_category = formula_factor_ref_category ( formula , df , factor )
2022-10-15 01:42:06 +11:00
terms [ column ] = CategoricalTerm ( { } , ref_category )
2022-11-29 14:40:50 +11:00
terms [ column ] . categories [ contrast ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-15 01:09:40 +11:00
else :
# Single term
2022-11-29 14:40:50 +11:00
terms [ column ] = SingleTerm ( raw_name , beta , pvalues [ raw_name ] )
2022-10-13 15:57:56 +11:00
2022-12-02 12:53:40 +11:00
# Handle ordinal regression intercepts
#if model_class is OrderedModel:
# intercept_names = [raw_name.split('/')[0] for raw_name in model.exog_names if '/' in raw_name]
# intercepts = model.transform_threshold_params(result._results.params[-len(intercept_names):])
# print(intercepts)
2022-10-13 15:57:56 +11:00
# Fit null model (for llnull)
if hasattr ( result , ' llnull ' ) :
llnull = result . llnull
else :
2022-11-29 14:40:50 +11:00
# Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
dm_exog = pd . DataFrame ( index = dmatrices [ 0 ] . index )
dm_exog [ ' Intercept ' ] = pd . Series ( dtype = ' float64 ' )
dm_exog [ ' Intercept ' ] . fillna ( 1 , inplace = True )
result_null = model_class ( endog = dmatrices [ 0 ] , exog = dm_exog ) . fit ( )
2022-10-13 15:57:56 +11:00
llnull = result_null . llf
2022-11-29 14:40:50 +11:00
if model_class is sm . OLS :
method_name = ' Least Squares '
else :
# Parse raw regression results to get fit method
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
header_entries = np . vectorize ( str . strip ) ( np . concatenate ( np . split ( np . array ( result . summary ( ) . tables [ 0 ] . data ) , 2 , axis = 1 ) ) )
header_dict = { x [ 0 ] : x [ 1 ] for x in header_entries }
method_name = header_dict [ ' Method: ' ]
2022-10-13 15:57:56 +11:00
2022-10-16 02:31:23 +11:00
# Get full name to display
if model_class is sm . Logit :
full_name = ' Logistic Regression '
else :
full_name = ' {} Regression ' . format ( model_class . __name__ )
if fit_kwargs . get ( ' cov_type ' , ' nonrobust ' ) != ' nonrobust ' :
full_name = ' Robust {} ' . format ( full_name )
2022-12-02 12:53:40 +11:00
comments = [ ]
if model_class is OrderedModel :
comments . append ( ' Cutpoints are omitted from the table of model parameters. ' )
2022-10-13 15:57:56 +11:00
return RegressionResult (
result ,
2022-11-29 14:40:50 +11:00
full_name , model_class . __name__ , method_name ,
2022-12-02 12:53:40 +11:00
dep , result . nobs , result . df_model , datetime . now ( ) , getattr ( result , ' cov_type ' , ' nonrobust ' ) ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 15:57:56 +11:00
result . llf , llnull ,
getattr ( result , ' df_resid ' , None ) , getattr ( result , ' rsquared ' , None ) , getattr ( result , ' fvalue ' , None ) ,
2022-12-02 12:53:40 +11:00
comments ,
2022-10-13 15:57:56 +11:00
exp
)
2022-10-13 17:23:29 +11:00
2022-11-29 23:00:14 +11:00
def regress_bootstrap (
model_class , df , dep , formula , * ,
nan_policy = ' warn ' ,
model_kwargs = None , fit_kwargs = None ,
samples = 1000 ,
* * kwargs
) :
"""
Fit a statsmodels regression model , using bootstrapping to compute confidence intervals and * p * values
: param model_class : See : func : ` yli . regress `
: param df : See : func : ` yli . regress `
: param dep : See : func : ` yli . regress `
: param formula : See : func : ` yli . regress `
: param nan_policy : See : func : ` yli . regress `
: param model_kwargs : See : func : ` yli . regress `
: param fit_kwargs : See : func : ` yli . regress `
: param samples : Number of bootstrap samples to draw
: type samples : int
: param kwargs : Other arguments to pass to : func : ` yli . regress `
: rtype : : class : ` yli . regress . RegressionResult `
"""
if model_kwargs is None :
model_kwargs = { }
if fit_kwargs is None :
fit_kwargs = { }
# Check for/clean NaNs
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
df = check_nan ( df , nan_policy )
# Ensure numeric type for dependent variable
if df [ dep ] . dtype != ' float64 ' :
df [ dep ] = df [ dep ] . astype ( ' float64 ' )
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable ( df )
# Precompute design matrices
dmatrices = patsy . dmatrices ( dep + ' ~ ' + formula , df , return_type = ' dataframe ' )
# Fit full model
full_model = regress ( model_class , df , dep , formula , nan_policy = ' raise ' , _dmatrices = dmatrices , model_kwargs = model_kwargs , fit_kwargs = fit_kwargs , * * kwargs )
# Initialise bootstrap_results
bootstrap_results = { } # Dict mapping term raw names to bootstrap betas
for term in full_model . terms . values ( ) :
if isinstance ( term , SingleTerm ) :
bootstrap_results [ term . raw_name ] = [ ]
else :
for sub_term in term . categories . values ( ) :
bootstrap_results [ sub_term . raw_name ] = [ ]
# Draw bootstrap samples and regress
dmatrices = dmatrices [ 0 ] . join ( dmatrices [ 1 ] )
for i in tqdm ( range ( samples ) ) :
bootstrap_rows = dmatrices . sample ( len ( df ) , replace = True )
model = model_class ( endog = bootstrap_rows . iloc [ : , 0 ] , exog = bootstrap_rows . iloc [ : , 1 : ] , * * model_kwargs )
model . formula = dep + ' ~ ' + formula
result = model . fit ( disp = False , * * fit_kwargs )
for raw_name , raw_beta in zip ( model . exog_names , result . _results . params ) :
bootstrap_results [ raw_name ] . append ( raw_beta )
# Combine bootstrap results
terms = { }
for term_name , term in full_model . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
bootstrap_betas = bootstrap_results [ term . raw_name ]
bootstrap_pvalue = sum ( 1 for b in bootstrap_betas if b < 0 ) / len ( bootstrap_betas )
bootstrap_pvalue = 2 * min ( bootstrap_pvalue , 1 - bootstrap_pvalue )
terms [ term_name ] = SingleTerm ( term . raw_name , Estimate ( term . beta . point , np . quantile ( bootstrap_betas , config . alpha / 2 ) , np . quantile ( bootstrap_betas , 1 - config . alpha / 2 ) ) , bootstrap_pvalue )
else :
categories = { }
for sub_term_name , sub_term in term . categories . items ( ) :
bootstrap_betas = bootstrap_results [ sub_term . raw_name ]
bootstrap_pvalue = sum ( 1 for b in bootstrap_betas if b < 0 ) / len ( bootstrap_betas )
bootstrap_pvalue = 2 * min ( bootstrap_pvalue , 1 - bootstrap_pvalue )
categories [ sub_term_name ] = SingleTerm ( sub_term . raw_name , Estimate ( sub_term . beta . point , np . quantile ( bootstrap_betas , config . alpha / 2 ) , np . quantile ( bootstrap_betas , 1 - config . alpha / 2 ) ) , bootstrap_pvalue )
terms [ term_name ] = CategoricalTerm ( categories , term . ref_category )
return RegressionResult (
None ,
full_model . full_name , full_model . model_name , full_model . fit_method ,
dep , full_model . nobs , full_model . dof_model , datetime . now ( ) , ' Bootstrap ' ,
terms ,
full_model . llf , full_model . llnull ,
full_model . dof_resid , full_model . rsquared , full_model . f_statistic ,
2022-12-02 12:53:40 +11:00
full_model . comments ,
2022-11-29 23:00:14 +11:00
full_model . exp
)
2022-10-16 02:31:37 +11:00
def logit_then_regress ( model_class , df , dep , formula , * , nan_policy = ' warn ' , * * kwargs ) :
2022-10-17 21:41:19 +11:00
"""
Perform logistic regression , then use parameters as start parameters for desired regression
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param kwargs : Passed through to : func : ` yli . regress `
: rtype : : class : ` yli . regress . RegressionResult `
"""
2022-10-16 02:31:37 +11:00
# Check for/clean NaNs
# Do this once here so we only get 1 warning
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
df = check_nan ( df , nan_policy )
# Perform logistic regression
logit_result = regress ( sm . Logit , df , dep , formula , * * kwargs )
logit_params = logit_result . raw_result . params
2022-10-17 20:34:36 +11:00
# Check convergence
if not logit_result . raw_result . mle_retvals [ ' converged ' ] :
return None
2022-10-16 02:31:37 +11:00
# Perform desired regression
return regress ( model_class , df , dep , formula , start_params = logit_params , * * kwargs )
2022-10-13 17:23:29 +11:00
# -----------------------------
# Penalised logistic regression
class PenalisedLogit ( statsmodels . discrete . discrete_model . BinaryModel ) :
"""
2022-10-17 21:41:19 +11:00
statsmodel - compatible model for computing Firth penalised logistic regression
Uses the R * logistf * library .
2022-10-13 17:23:29 +11:00
2022-10-17 21:41:19 +11:00
This class should be used in conjunction with : func : ` yli . regress ` .
2022-10-18 19:25:12 +11:00
* * Example : * *
. . code - block : :
df = pd . DataFrame ( {
' Pred ' : [ 1 ] * 20 + [ 0 ] * 220 ,
' Outcome ' : [ 1 ] * 40 + [ 0 ] * 200
} )
yli . regress ( yli . PenalisedLogit , df , ' Outcome ' , ' Pred ' , exp = False )
. . code - block : : text
Penalised Logistic Regression Results
== == == == == == == == == == == == == == == == == == == == == == == == == == == == =
Dep . Variable : Outcome | No . Observations : 240
Model : Logit | Df . Model : 1
Method : Penalised ML | Pseudo R² : 0.37
Date : 2022 - 10 - 19 | LL - Model : - 66.43
Time : 07 : 50 : 40 | LL - Null : - 105.91
Std . Errors : Non - Robust | p ( LR ) : < 0.001 *
== == == == == == == == == == == == == == == == == == == == == == == == == == == == =
β ( 95 % CI ) p
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
( Intercept ) - 2.28 ( - 2.77 - - 1.85 ) < 0.001 *
Pred 5.99 ( 3.95 - 10.85 ) < 0.001 *
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The output summarises the result of the regression .
The summary table shows that a penalised method was applied .
Note that because ` exp = False ` was passed , the parameter estimates are not automatically exponentiated .
2022-10-13 17:23:29 +11:00
"""
2022-10-17 20:34:58 +11:00
def fit ( self , disp = False ) :
2022-10-13 17:23:29 +11:00
import rpy2 . robjects as ro
import rpy2 . robjects . packages
import rpy2 . robjects . pandas2ri
# Assume data is already cleaned from regress()
2022-11-29 14:40:50 +11:00
df = self . data . orig_endog . join ( self . data . orig_exog )
2022-10-13 17:23:29 +11:00
# Convert bool to int otherwise rpy2 chokes
df = df . replace ( { False : 0 , True : 1 } )
# Import logistf
ro . packages . importr ( ' logistf ' )
with ro . conversion . localconverter ( ro . default_converter + ro . pandas2ri . converter ) :
with ro . local_context ( ) as lc :
# Convert DataFrame to R
lc [ ' df ' ] = df
# Transfer other parameters to R
lc [ ' formula_ ' ] = self . formula
2022-10-15 23:24:29 +11:00
lc [ ' alpha ' ] = config . alpha
2022-10-13 17:23:29 +11:00
# Fit the model
2022-10-15 23:24:29 +11:00
model = ro . r ( ' logistf(formula_, data=df, alpha=alpha) ' )
2022-10-13 17:23:29 +11:00
2022-10-14 21:57:22 +11:00
# TODO: Handle categorical terms?
terms = { t : SingleTerm ( t , Estimate ( b , ci0 , ci1 ) , p ) for t , b , ci0 , ci1 , p in zip ( model [ ' terms ' ] , model [ ' coefficients ' ] , model [ ' ci.lower ' ] , model [ ' ci.upper ' ] , model [ ' prob ' ] ) }
2022-10-13 17:23:29 +11:00
return RegressionResult (
model ,
' Penalised Logistic Regression ' , ' Logit ' , ' Penalised ML ' ,
2022-10-17 20:34:58 +11:00
self . endog_names , model [ ' n ' ] [ 0 ] , model [ ' df ' ] [ 0 ] , datetime . now ( ) , ' nonrobust ' ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 17:23:29 +11:00
model [ ' loglik ' ] [ 0 ] , model [ ' loglik ' ] [ 1 ] ,
None , None , None ,
2022-12-02 12:53:40 +11:00
[ ] ,
2022-10-13 17:23:29 +11:00
None # Set exp in regress()
)
2022-12-02 12:53:40 +11:00