2022-10-13 15:57:56 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
2022-10-13 16:24:01 +11:00
import pandas as pd
2022-10-13 15:57:56 +11:00
from scipy import stats
2022-10-13 17:23:29 +11:00
import statsmodels
2022-10-13 15:57:56 +11:00
import statsmodels . api as sm
from statsmodels . iolib . table import SimpleTable
2022-10-13 16:24:01 +11:00
from statsmodels . stats . outliers_influence import variance_inflation_factor
2022-10-13 15:57:56 +11:00
from datetime import datetime
import itertools
2022-10-14 14:09:29 +11:00
from . bayes_factors import BayesFactor , bayesfactor_afbf
2022-10-15 23:24:29 +11:00
from . config import config
2022-10-14 20:18:25 +11:00
from . sig_tests import FTestResult
2022-10-15 01:09:40 +11:00
from . utils import Estimate , check_nan , cols_for_formula , fmt_p , formula_factor_ref_category , parse_patsy_term
2022-10-13 15:57:56 +11:00
2022-10-17 21:41:19 +11:00
def vif ( df , formula = None , * , nan_policy = ' warn ' ) :
2022-10-14 14:08:51 +11:00
"""
2022-10-17 21:41:19 +11:00
Calculate the variance inflation factor ( VIF ) for each variable in * df *
2022-10-14 14:08:51 +11:00
2022-10-17 21:41:19 +11:00
: param df : Data to calculate the VIF for
: type df : DataFrame
: param formula : If specified , calculate the VIF only for the variables in the formula
: type formula : str
: return : The variance inflation factor
: rtype : float
2022-10-14 14:08:51 +11:00
"""
if formula :
# Only consider columns in the formula
2022-10-15 01:09:40 +11:00
df = df [ cols_for_formula ( formula , df ) ]
2022-10-13 16:24:01 +11:00
# Check for/clean NaNs
df = check_nan ( df , nan_policy )
# Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..."
df = pd . get_dummies ( df , drop_first = True ) # Convert categorical dtypes
df = df . astype ( ' float64 ' ) # Convert all other dtypes
# Add intercept column
orig_columns = list ( df . columns )
df [ ' Intercept ' ] = [ 1 ] * len ( df )
vifs = { }
for i , col in enumerate ( orig_columns ) :
vifs [ col ] = variance_inflation_factor ( df , i )
return pd . Series ( vifs )
2022-10-13 15:57:56 +11:00
# ----------
# Regression
class LikelihoodRatioTestResult :
2022-10-17 21:41:19 +11:00
"""
Result of a likelihood ratio test for regression
See : meth : ` RegressionResult . lrtest_null ` .
"""
2022-10-13 15:57:56 +11:00
def __init__ ( self , statistic , dof , pvalue ) :
2022-10-17 21:41:19 +11:00
#: Likelihood ratio test statistic (*float*)
2022-10-13 15:57:56 +11:00
self . statistic = statistic
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the likelihood ratio test statistic (*int*)
2022-10-13 15:57:56 +11:00
self . dof = dof
2022-10-17 21:41:19 +11:00
#: *p* value for the likelihood ratio test (*float*)
2022-10-13 15:57:56 +11:00
self . pvalue = pvalue
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
2022-10-14 14:48:26 +11:00
return ' LR( {} ) = {:.2f} ; <i>p</i> {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , html = True ) )
2022-10-13 15:57:56 +11:00
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the likelihood ratio test
: rtype : str
"""
2022-10-14 14:48:26 +11:00
return ' LR( {} ) = {:.2f} ; p {} ' . format ( self . dof , self . statistic , fmt_p ( self . pvalue , html = False ) )
2022-10-13 15:57:56 +11:00
class RegressionResult :
"""
Result of a regression
2022-10-17 21:41:19 +11:00
See : func : ` yli . regress ` .
2022-10-13 15:57:56 +11:00
"""
def __init__ ( self ,
raw_result ,
full_name , model_name , fit_method ,
2022-10-17 20:34:58 +11:00
dep , nobs , dof_model , fitted_dt , cov_type ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 15:57:56 +11:00
llf , llnull ,
dof_resid , rsquared , f_statistic ,
exp
) :
2022-10-17 21:41:19 +11:00
#: Raw result from statsmodels *model.fit*
2022-10-13 15:57:56 +11:00
self . raw_result = raw_result
# Information for display
2022-10-17 21:41:19 +11:00
#: Full name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . full_name = full_name
2022-10-17 21:41:19 +11:00
#: Short name of the regression model type (*str*)
2022-10-13 15:57:56 +11:00
self . model_name = model_name
2022-10-17 21:41:19 +11:00
#: Method for fitting the regression model (*str*)
2022-10-13 15:57:56 +11:00
self . fit_method = fit_method
# Basic fitted model information
2022-10-17 21:41:19 +11:00
#: Name of the dependent variable (*str*)
2022-10-13 15:57:56 +11:00
self . dep = dep
2022-10-17 21:41:19 +11:00
#: Number of observations (*int*)
2022-10-13 15:57:56 +11:00
self . nobs = nobs
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the model (*int*)
2022-10-13 15:57:56 +11:00
self . dof_model = dof_model
2022-10-17 21:41:19 +11:00
#: Date and time of fitting the model (Python *datetime*)
2022-10-13 15:57:56 +11:00
self . fitted_dt = fitted_dt
2022-10-17 21:41:19 +11:00
#: Method for computing the covariance matrix (*str*)
2022-10-17 20:34:58 +11:00
self . cov_type = cov_type
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
# Regression coefficients/p values
2022-10-17 21:41:19 +11:00
#: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`)
2022-10-14 21:57:22 +11:00
self . terms = terms
2022-10-13 15:57:56 +11:00
# Model log-likelihood
2022-10-17 21:41:19 +11:00
#: Log-likelihood of fitted model (*float*)
2022-10-13 15:57:56 +11:00
self . llf = llf
2022-10-17 21:41:19 +11:00
#: Log-likelihood of null model (*float*)
2022-10-13 15:57:56 +11:00
self . llnull = llnull
# Extra statistics (not all regression models have these)
2022-10-17 21:41:19 +11:00
#: Degrees of freedom for the residuals (*int*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . dof_resid = dof_resid
2022-10-17 21:41:19 +11:00
#: *R*:sup:`2` statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . rsquared = rsquared
2022-10-17 21:41:19 +11:00
#: *F* statistic (*float*; *None* if N/A)
2022-10-13 15:57:56 +11:00
self . f_statistic = f_statistic
# Config for display style
2022-10-17 21:41:19 +11:00
#: See :func:`yli.regress`
2022-10-13 15:57:56 +11:00
self . exp = exp
@property
def pseudo_rsquared ( self ) :
2022-10-17 21:41:19 +11:00
""" McFadden ' s pseudo *R*:sup:`2` statistic """
2022-10-13 15:57:56 +11:00
return 1 - self . llf / self . llnull
def lrtest_null ( self ) :
2022-10-17 21:41:19 +11:00
"""
Compute the likelihood ratio test comparing the model to the null model
: rtype : : class : ` LikelihoodRatioTestResult `
"""
2022-10-13 15:57:56 +11:00
statistic = - 2 * ( self . llnull - self . llf )
pvalue = 1 - stats . chi2 . cdf ( statistic , self . dof_model )
return LikelihoodRatioTestResult ( statistic , self . dof_model , pvalue )
def ftest ( self ) :
2022-10-17 21:41:19 +11:00
"""
Perform the * F * test that all slopes are 0
: rtype : : class : ` yli . sig_tests . FTestResult `
"""
2022-10-13 15:57:56 +11:00
pvalue = 1 - stats . f ( self . dof_model , self . dof_resid ) . cdf ( self . f_statistic )
return FTestResult ( self . f_statistic , self . dof_model , self . dof_resid , pvalue )
2022-10-14 14:09:29 +11:00
def bayesfactor_beta_zero ( self , term ) :
"""
2022-10-17 21:41:19 +11:00
Compute a Bayes factor testing the hypothesis that the given beta is 0
2022-10-16 02:49:43 +11:00
2022-10-17 21:41:19 +11:00
Uses the R * BFpack * library .
Requires the regression to be from statsmodels .
The term must be specified as the * raw name * from the statsmodels regression , available via : attr : ` RegressionResult . raw_result ` .
: param term : Raw name of the term to be tested
: type term : str
: rtype : : class : ` yli . bayes_factors . BayesFactor `
2022-10-14 14:09:29 +11:00
"""
2022-10-16 02:49:43 +11:00
# FIXME: Allow specifying our renamed terms
2022-10-14 14:09:29 +11:00
# Get parameters required for AFBF
2022-10-16 02:49:43 +11:00
params = pd . Series ( { raw_name . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) : beta for raw_name , beta in self . raw_result . params . items ( ) } )
2022-10-14 14:09:29 +11:00
cov = self . raw_result . cov_params ( )
# Compute BF matrix
bf01 = bayesfactor_afbf ( params , cov , self . nobs , ' {} = 0 ' . format ( term . replace ( ' [ ' , ' _ ' ) . replace ( ' ] ' , ' _ ' ) ) )
bf01 = BayesFactor ( bf01 . factor , ' 0 ' , ' {} = 0 ' . format ( term ) , ' 1 ' , ' {} ≠ 0 ' . format ( term ) )
if bf01 . factor > = 1 :
return bf01
else :
return bf01 . invert ( )
2022-10-13 15:57:56 +11:00
def _header_table ( self , html ) :
""" Return the entries for the header table """
# Left column
left_col = [ ]
left_col . append ( ( ' Dep. Variable: ' , self . dep ) )
left_col . append ( ( ' Model: ' , self . model_name ) )
left_col . append ( ( ' Method: ' , self . fit_method ) )
left_col . append ( ( ' Date: ' , self . fitted_dt . strftime ( ' % Y- % m- %d ' ) ) )
left_col . append ( ( ' Time: ' , self . fitted_dt . strftime ( ' % H: % M: % S ' ) ) )
2022-10-17 20:34:58 +11:00
left_col . append ( ( ' Std. Errors: ' , ' Non-Robust ' if self . cov_type == ' nonrobust ' else self . cov_type . upper ( ) ) )
2022-10-13 15:57:56 +11:00
# Right column
right_col = [ ]
2022-10-17 20:34:58 +11:00
right_col . append ( ( ' No. Observations: ' , format ( self . nobs , ' .0f ' ) ) )
2022-10-16 02:30:24 +11:00
right_col . append ( ( ' Df. Model: ' , format ( self . dof_model , ' .0f ' ) ) )
2022-10-13 15:57:56 +11:00
if self . dof_resid :
right_col . append ( ( ' Df. Residuals: ' , format ( self . dof_resid , ' .0f ' ) ) )
if self . rsquared :
right_col . append ( ( ' <i>R</i><sup>2</sup>: ' if html else ' R²: ' , format ( self . rsquared , ' .2f ' ) ) )
else :
right_col . append ( ( ' Pseudo <i>R</i><sup>2</sup>: ' if html else ' Pseudo R²: ' , format ( self . pseudo_rsquared , ' .2f ' ) ) )
if self . f_statistic :
# Report the F test if available
f_result = self . ftest ( )
if html :
right_col . append ( ( ' <i>F</i>: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-10-15 23:30:41 +11:00
right_col . append ( ( ' <i>p</i> (<i>F</i>): ' , fmt_p ( f_result . pvalue , html = True , tabular = True ) ) )
2022-10-13 15:57:56 +11:00
else :
right_col . append ( ( ' F: ' , format ( f_result . statistic , ' .2f ' ) ) )
2022-10-15 23:30:41 +11:00
right_col . append ( ( ' p (F): ' , fmt_p ( f_result . pvalue , html = False , tabular = True ) ) )
2022-10-13 15:57:56 +11:00
else :
# Otherwise report likelihood ratio test as overall test
lrtest_result = self . lrtest_null ( )
right_col . append ( ( ' LL-Model: ' , format ( self . llf , ' .2f ' ) ) )
right_col . append ( ( ' LL-Null: ' , format ( self . llnull , ' .2f ' ) ) )
if html :
2022-10-15 23:30:41 +11:00
right_col . append ( ( ' <i>p</i> (LR): ' , fmt_p ( lrtest_result . pvalue , html = True , tabular = True ) ) )
2022-10-13 15:57:56 +11:00
else :
2022-10-15 23:30:41 +11:00
right_col . append ( ( ' p (LR): ' , fmt_p ( lrtest_result . pvalue , html = False , tabular = True ) ) )
2022-10-13 15:57:56 +11:00
return left_col , right_col
2022-10-18 18:44:04 +11:00
def __repr__ ( self ) :
if config . repr_is_summary :
return self . summary ( )
return super ( ) . __repr__ ( )
2022-10-13 15:57:56 +11:00
def _repr_html_ ( self ) :
# Render header table
left_col , right_col = self . _header_table ( html = True )
out = ' <table><caption> {} Results</caption> ' . format ( self . full_name )
for left_cell , right_cell in itertools . zip_longest ( left_col , right_col ) :
out + = ' <tr><th> {} </th><td> {} </td><th> {} </th><td> {} </td></tr> ' . format (
left_cell [ 0 ] if left_cell else ' ' ,
left_cell [ 1 ] if left_cell else ' ' ,
right_cell [ 0 ] if right_cell else ' ' ,
right_cell [ 1 ] if right_cell else ' '
)
out + = ' </table> '
# Render coefficients table
2022-10-15 23:24:29 +11:00
out + = ' <table><tr><th></th><th style= " text-align:center " > {} </th><th colspan= " 3 " style= " text-align:center " >( {:g} % CI)</th><th style= " text-align:center " ><i>p</i></th></tr> ' . format ( ' exp(<i>β</i>) ' if self . exp else ' <i>β</i> ' , ( 1 - config . alpha ) * 100 )
2022-10-13 15:57:56 +11:00
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
2022-10-15 23:30:41 +11:00
out + = ' <tr><th> {} </th><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( term . pvalue , html = True , tabular = True ) )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
out + = ' <tr><th> {} </th><td></td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term_name )
# Render reference category
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td>Ref.</td><td style= " padding-right:0 " ></td><td></td><td style= " padding-left:0 " ></td><td></td></tr> ' . format ( term . ref_category )
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-10-15 23:30:41 +11:00
out + = ' <tr><td style= " text-align:right;font-style:italic " > {} </td><td> {:.2f} </td><td style= " padding-right:0 " >( {:.2f} </td><td>–</td><td style= " padding-left:0 " > {:.2f} )</td><td style= " text-align:left " > {} </td></tr> ' . format ( sub_term_name , beta . point , beta . ci_lower , beta . ci_upper , fmt_p ( sub_term . pvalue , html = True , tabular = True ) )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
out + = ' </table> '
# TODO: Have a detailed view which shows SE, t/z, etc.
return out
def summary ( self ) :
2022-10-17 21:41:19 +11:00
"""
Return a stringified summary of the regression model
: rtype : str
"""
2022-10-13 15:57:56 +11:00
# Render header table
left_col , right_col = self . _header_table ( html = False )
# Ensure equal sizes for SimpleTable
if len ( right_col ) > len ( left_col ) :
left_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( right_col ) - len ( left_col ) ) )
elif len ( left_col ) > len ( right_col ) :
right_col . extend ( [ ( ' ' , ' ' ) ] * ( len ( left_col ) - len ( right_col ) ) )
table1 = SimpleTable ( np . concatenate ( [ left_col , right_col ] , axis = 1 ) , title = ' {} Results ' . format ( self . full_name ) )
table1 . insert_stubs ( 2 , [ ' | ' ] * len ( left_col ) )
# Get rid of last line (merge with next table)
table1_lines = table1 . as_text ( ) . splitlines ( keepends = False )
out = ' \n ' . join ( table1_lines [ : - 1 ] ) + ' \n '
# Render coefficients table
table_data = [ ]
2022-10-14 21:57:22 +11:00
for term_name , term in self . terms . items ( ) :
if isinstance ( term , SingleTerm ) :
# Single term
# Exponentiate beta if requested
beta = term . beta
if self . exp :
beta = np . exp ( beta )
# Add some extra padding
2022-10-15 23:30:41 +11:00
table_data . append ( [ term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( term . pvalue , html = False , tabular = True ) ] )
2022-10-14 21:57:22 +11:00
elif isinstance ( term , CategoricalTerm ) :
# Categorical term
table_data . append ( [ term_name + ' ' , ' ' , ' ' , ' ' , ' ' , ' ' ] )
# Render reference category
table_data . append ( [ ' {} ' . format ( term . ref_category ) , ' Ref. ' , ' ' , ' ' , ' ' , ' ' ] )
# Loop over terms
for sub_term_name , sub_term in term . categories . items ( ) :
# Exponentiate beta if requested
beta = sub_term . beta
if self . exp :
beta = np . exp ( beta )
2022-10-15 23:30:41 +11:00
table_data . append ( [ sub_term_name + ' ' , format ( beta . point , ' .2f ' ) , ' ( {:.2f} ' . format ( beta . ci_lower ) , ' - ' , ' {:.2f} ) ' . format ( beta . ci_upper ) , ' ' + fmt_p ( sub_term . pvalue , html = False , tabular = True ) ] )
2022-10-14 21:57:22 +11:00
else :
raise Exception ( ' Attempt to render unknown term type ' )
2022-10-13 15:57:56 +11:00
table2 = SimpleTable ( data = table_data , headers = [ ' ' , ' exp(β) ' if self . exp else ' β ' , ' ' , ' \ue000 ' , ' ' , ' p ' ] ) # U+E000 is in Private Use Area, mark middle of CI column
2022-10-15 23:24:29 +11:00
table2_text = table2 . as_text ( ) . replace ( ' \ue000 ' , ' ( {:g} % CI) ' . format ( ( 1 - config . alpha ) * 100 ) ) # Render heading in the right spot
2022-10-13 15:57:56 +11:00
table2_lines = table2_text . splitlines ( keepends = False )
# Render divider line between 2 tables
max_table_len = max ( len ( table1_lines [ - 1 ] ) , len ( table2_lines [ - 1 ] ) )
out + = ' = ' * max_table_len + ' \n '
out + = ' \n ' . join ( table2_lines [ 1 : ] )
return out
2022-10-14 21:57:22 +11:00
class SingleTerm :
2022-10-17 21:41:19 +11:00
""" A term in a :class:`RegressionResult` which is a single term """
2022-10-14 21:57:22 +11:00
def __init__ ( self , raw_name , beta , pvalue ) :
2022-10-17 21:41:19 +11:00
#: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`)
2022-10-14 21:57:22 +11:00
self . raw_name = raw_name
2022-10-17 21:41:19 +11:00
#: :class:`yli.utils.Estimate` of the coefficient
2022-10-14 21:57:22 +11:00
self . beta = beta
2022-10-17 21:41:19 +11:00
#: *p* value for the coefficient (*float*)
2022-10-14 21:57:22 +11:00
self . pvalue = pvalue
class CategoricalTerm :
2022-10-17 21:41:19 +11:00
""" A group of terms in a :class:`RegressionResult` corresponding to a categorical variable """
2022-10-14 21:57:22 +11:00
def __init__ ( self , categories , ref_category ) :
2022-10-17 21:41:19 +11:00
#: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`)
2022-10-14 21:57:22 +11:00
self . categories = categories
2022-10-17 21:41:19 +11:00
#: Name of the reference category (*str*)
2022-10-14 21:57:22 +11:00
self . ref_category = ref_category
2022-10-13 15:57:56 +11:00
def regress (
model_class , df , dep , formula , * ,
2022-10-15 01:42:06 +11:00
nan_policy = ' warn ' ,
2022-10-16 02:31:08 +11:00
model_kwargs = None , fit_kwargs = None ,
family = None , # common model_kwargs
cov_type = None , maxiter = None , start_params = None , # common fit_kwargs
2022-10-15 01:42:06 +11:00
bool_baselevels = False , exp = None
2022-10-13 15:57:56 +11:00
) :
2022-10-15 01:42:06 +11:00
"""
Fit a statsmodels regression model
2022-10-17 21:41:19 +11:00
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param model_kwargs : Keyword arguments to pass to * model_class * constructor
: type model_kwargs : dict
: param fit_kwargs : Keyword arguments to pass to * model . fit *
: type fit_kwargs : dict
: param family : See statsmodels * GLM * constructor
: param cov_type : See statsmodels * model . fit *
: param maxiter : See statsmodels * model . fit *
: param start_params : See statsmodels * model . fit *
: param bool_baselevels : Show reference categories for boolean independent variables even if reference category is * False *
: type bool_baselevels : bool
: param exp : Report exponentiated parameters rather than raw parameters , default ( * None * ) is to autodetect based on * model_class *
: type exp : bool
: rtype : : class : ` yli . regress . RegressionResult `
2022-10-15 01:42:06 +11:00
"""
2022-10-13 15:57:56 +11:00
2022-10-16 02:31:08 +11:00
# Populate model_kwargs
if model_kwargs is None :
model_kwargs = { }
if family is not None :
model_kwargs [ ' family ' ] = family
# Populate fit_kwargs
if fit_kwargs is None :
fit_kwargs = { }
if cov_type is not None :
fit_kwargs [ ' cov_type ' ] = cov_type
if maxiter is not None :
fit_kwargs [ ' maxiter ' ] = maxiter
if start_params is not None :
fit_kwargs [ ' start_params ' ] = start_params
2022-10-13 15:57:56 +11:00
# Autodetect whether to exponentiate
if exp is None :
2022-10-16 02:31:08 +11:00
if model_class in ( sm . Logit , sm . Poisson , PenalisedLogit ) :
2022-10-13 15:57:56 +11:00
exp = True
else :
exp = False
# Check for/clean NaNs
2022-10-15 01:09:40 +11:00
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
2022-10-13 15:57:56 +11:00
df = check_nan ( df , nan_policy )
# Ensure numeric type for dependent variable
if df [ dep ] . dtype != ' float64 ' :
df [ dep ] = df [ dep ] . astype ( ' float64 ' )
# Convert pandas nullable types for independent variables
for col in df . columns :
if df [ col ] . dtype == ' Int64 ' :
df [ col ] = df [ col ] . astype ( ' float64 ' )
# Fit model
2022-10-16 02:31:08 +11:00
model = model_class . from_formula ( formula = dep + ' ~ ' + formula , data = df , * * model_kwargs )
2022-10-17 20:34:36 +11:00
result = model . fit ( disp = False , * * fit_kwargs )
2022-10-13 15:57:56 +11:00
2022-10-13 17:23:29 +11:00
if isinstance ( result , RegressionResult ) :
# Already processed!
result . exp = exp
return result
2022-10-17 20:34:36 +11:00
# Check convergence
if hasattr ( result , ' mle_retvals ' ) and not result . mle_retvals [ ' converged ' ] :
warnings . warn ( ' Maximum likelihood estimation failed to converge. Check raw_result.mle_retvals. ' )
2022-10-14 21:57:22 +11:00
# Process terms
terms = { }
2022-10-15 23:24:29 +11:00
confint = result . conf_int ( config . alpha )
2022-10-14 21:57:22 +11:00
for raw_name , raw_beta in result . params . items ( ) :
beta = Estimate ( raw_beta , confint [ 0 ] [ raw_name ] , confint [ 1 ] [ raw_name ] )
# Rename terms
if raw_name == ' Intercept ' :
# Intercept term (single term)
term = ' (Intercept) '
terms [ term ] = SingleTerm ( raw_name , beta , result . pvalues [ raw_name ] )
else :
2022-10-15 01:09:40 +11:00
# Parse if required
factor , column , contrast = parse_patsy_term ( formula , df , raw_name )
if contrast is not None :
# Categorical term
2022-10-15 01:42:06 +11:00
if bool_baselevels is False and contrast == ' True ' and set ( df [ column ] . unique ( ) ) == set ( [ True , False ] ) :
# Treat as single term
terms [ column ] = SingleTerm ( raw_name , beta , result . pvalues [ raw_name ] )
else :
# Add a new categorical term if not exists
if column not in terms :
ref_category = formula_factor_ref_category ( formula , df , factor )
terms [ column ] = CategoricalTerm ( { } , ref_category )
terms [ column ] . categories [ contrast ] = SingleTerm ( raw_name , beta , result . pvalues [ raw_name ] )
2022-10-15 01:09:40 +11:00
else :
# Single term
2022-10-15 01:42:06 +11:00
terms [ column ] = SingleTerm ( raw_name , beta , result . pvalues [ raw_name ] )
2022-10-13 15:57:56 +11:00
# Fit null model (for llnull)
if hasattr ( result , ' llnull ' ) :
llnull = result . llnull
else :
result_null = model_class . from_formula ( formula = dep + ' ~ 1 ' , data = df ) . fit ( )
llnull = result_null . llf
# Parse raw regression results (to get fit method)
header_entries = np . vectorize ( str . strip ) ( np . concatenate ( np . split ( np . array ( result . summary ( ) . tables [ 0 ] . data ) , 2 , axis = 1 ) ) )
header_dict = { x [ 0 ] : x [ 1 ] for x in header_entries }
2022-10-16 02:31:23 +11:00
# Get full name to display
if model_class is sm . Logit :
full_name = ' Logistic Regression '
else :
full_name = ' {} Regression ' . format ( model_class . __name__ )
if fit_kwargs . get ( ' cov_type ' , ' nonrobust ' ) != ' nonrobust ' :
full_name = ' Robust {} ' . format ( full_name )
2022-10-13 15:57:56 +11:00
return RegressionResult (
result ,
2022-10-16 02:31:23 +11:00
full_name , model_class . __name__ , header_dict [ ' Method: ' ] ,
2022-10-17 20:34:58 +11:00
dep , result . nobs , result . df_model , datetime . now ( ) , result . cov_type ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 15:57:56 +11:00
result . llf , llnull ,
getattr ( result , ' df_resid ' , None ) , getattr ( result , ' rsquared ' , None ) , getattr ( result , ' fvalue ' , None ) ,
exp
)
2022-10-13 17:23:29 +11:00
2022-10-16 02:31:37 +11:00
def logit_then_regress ( model_class , df , dep , formula , * , nan_policy = ' warn ' , * * kwargs ) :
2022-10-17 21:41:19 +11:00
"""
Perform logistic regression , then use parameters as start parameters for desired regression
: param model_class : Type of regression model to fit
: type model_class : statsmodels model class
: param df : Data to perform regression on
: type df : DataFrame
: param dep : Column in * df * for the dependent variable ( numeric )
: type dep : str
: param formula : Patsy formula for the regression model
: type formula : str
: param nan_policy : How to handle * nan * values ( see : ref : ` nan - handling ` )
: type nan_policy : str
: param kwargs : Passed through to : func : ` yli . regress `
: rtype : : class : ` yli . regress . RegressionResult `
"""
2022-10-16 02:31:37 +11:00
# Check for/clean NaNs
# Do this once here so we only get 1 warning
df = df [ [ dep ] + cols_for_formula ( formula , df ) ]
df = check_nan ( df , nan_policy )
# Perform logistic regression
logit_result = regress ( sm . Logit , df , dep , formula , * * kwargs )
logit_params = logit_result . raw_result . params
2022-10-17 20:34:36 +11:00
# Check convergence
if not logit_result . raw_result . mle_retvals [ ' converged ' ] :
return None
2022-10-16 02:31:37 +11:00
# Perform desired regression
return regress ( model_class , df , dep , formula , start_params = logit_params , * * kwargs )
2022-10-13 17:23:29 +11:00
# -----------------------------
# Penalised logistic regression
class PenalisedLogit ( statsmodels . discrete . discrete_model . BinaryModel ) :
"""
2022-10-17 21:41:19 +11:00
statsmodel - compatible model for computing Firth penalised logistic regression
Uses the R * logistf * library .
2022-10-13 17:23:29 +11:00
2022-10-17 21:41:19 +11:00
This class should be used in conjunction with : func : ` yli . regress ` .
2022-10-13 17:23:29 +11:00
"""
2022-10-17 20:34:58 +11:00
def fit ( self , disp = False ) :
2022-10-13 17:23:29 +11:00
import rpy2 . robjects as ro
import rpy2 . robjects . packages
import rpy2 . robjects . pandas2ri
# Assume data is already cleaned from regress()
df = self . data . frame . copy ( )
# Convert bool to int otherwise rpy2 chokes
df = df . replace ( { False : 0 , True : 1 } )
# Import logistf
ro . packages . importr ( ' logistf ' )
with ro . conversion . localconverter ( ro . default_converter + ro . pandas2ri . converter ) :
with ro . local_context ( ) as lc :
# Convert DataFrame to R
lc [ ' df ' ] = df
# Transfer other parameters to R
lc [ ' formula_ ' ] = self . formula
2022-10-15 23:24:29 +11:00
lc [ ' alpha ' ] = config . alpha
2022-10-13 17:23:29 +11:00
# Fit the model
2022-10-15 23:24:29 +11:00
model = ro . r ( ' logistf(formula_, data=df, alpha=alpha) ' )
2022-10-13 17:23:29 +11:00
2022-10-14 21:57:22 +11:00
# TODO: Handle categorical terms?
terms = { t : SingleTerm ( t , Estimate ( b , ci0 , ci1 ) , p ) for t , b , ci0 , ci1 , p in zip ( model [ ' terms ' ] , model [ ' coefficients ' ] , model [ ' ci.lower ' ] , model [ ' ci.upper ' ] , model [ ' prob ' ] ) }
2022-10-13 17:23:29 +11:00
return RegressionResult (
model ,
' Penalised Logistic Regression ' , ' Logit ' , ' Penalised ML ' ,
2022-10-17 20:34:58 +11:00
self . endog_names , model [ ' n ' ] [ 0 ] , model [ ' df ' ] [ 0 ] , datetime . now ( ) , ' nonrobust ' ,
2022-10-14 21:57:22 +11:00
terms ,
2022-10-13 17:23:29 +11:00
model [ ' loglik ' ] [ 0 ] , model [ ' loglik ' ] [ 1 ] ,
None , None , None ,
None # Set exp in regress()
)