Implement bootstrap_regress

This commit is contained in:
RunasSudo 2022-11-29 14:40:50 +11:00
parent e268f385be
commit e71a1aea12
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
3 changed files with 157 additions and 26 deletions

View File

@ -15,6 +15,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
from .bayes_factors import bayesfactor_afbf from .bayes_factors import bayesfactor_afbf
from .bootstrap import bootstrap_regress
from .config import config from .config import config
from .descriptives import auto_descriptives from .descriptives import auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist

103
yli/bootstrap.py Normal file
View File

@ -0,0 +1,103 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import numpy as np
import pandas as pd
import patsy
from tqdm import tqdm
from datetime import datetime
from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
def bootstrap_regress(
model_class, df, dep, formula, *,
nan_policy='warn',
samples=1000,
**kwargs
):
"""
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
:param model_class: See :func:`yli.regress`
:param df: See :func:`yli.regress`
:param dep: See :func:`yli.regress`
:param formula: See :func:`yli.regress`
:param nan_policy: See :func:`yli.regress`
:param samples: Number of bootstrap samples to draw
:type samples: int
:param kwargs: See :func:`yli.regress`
:rtype: :class:`yli.regress.RegressionResult`
"""
# Check for/clean NaNs
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
df = df[[dep] + cols_for_formula(formula, df)]
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
# Precompute design matrices
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
# Fit full model
full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
# Cache reference categories
ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
# Draw bootstrap samples and regress
bootstrap_results = []
for i in tqdm(range(samples)):
#bootstrap_sample = df.sample(len(df), replace=True)
#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
# Combine bootstrap results
terms = {}
for term_name, term in full_model.terms.items():
if isinstance(term, SingleTerm):
bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
else:
categories = {}
for sub_term_name, sub_term in term.categories.items():
bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
terms[term_name] = CategoricalTerm(categories, term.ref_category)
return RegressionResult(
None,
full_model.full_name, full_model.model_name, full_model.fit_method,
dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
terms,
full_model.llf, full_model.llnull,
full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
full_model.exp
)

View File

@ -16,6 +16,7 @@
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import patsy
from scipy import stats from scipy import stats
import statsmodels import statsmodels
import statsmodels.api as sm import statsmodels.api as sm
@ -438,7 +439,8 @@ def regress(
model_kwargs=None, fit_kwargs=None, model_kwargs=None, fit_kwargs=None,
family=None, # common model_kwargs family=None, # common model_kwargs
cov_type=None, maxiter=None, start_params=None, # common fit_kwargs cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
bool_baselevels=False, exp=None bool_baselevels=False, exp=None,
_dmatrices=None, _ref_categories=None,
): ):
""" """
Fit a statsmodels regression model Fit a statsmodels regression model
@ -526,19 +528,26 @@ def regress(
else: else:
exp = False exp = False
# Check for/clean NaNs if _dmatrices is None:
df = df[[dep] + cols_for_formula(formula, df)] # Check for/clean NaNs
df = check_nan(df, nan_policy) df = df[[dep] + cols_for_formula(formula, df)]
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
if df[dep].dtype != 'float64': # Ensure numeric type for dependent variable
df[dep] = df[dep].astype('float64') if df[dep].dtype != 'float64':
df[dep] = df[dep].astype('float64')
# Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df) # Convert pandas nullable types for independent variables as this breaks statsmodels
df = convert_pandas_nullable(df)
# Construct design matrix from formula
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
else:
dmatrices = _dmatrices
# Fit model # Fit model
model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df, **model_kwargs) model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs)
model.formula = dep + ' ~ ' + formula
result = model.fit(disp=False, **fit_kwargs) result = model.fit(disp=False, **fit_kwargs)
if isinstance(result, RegressionResult): if isinstance(result, RegressionResult):
@ -553,16 +562,20 @@ def regress(
# Process terms # Process terms
terms = {} terms = {}
confint = result.conf_int(config.alpha) # Join term names manually because statsmodels wrapper is very slow
#confint = result.conf_int(config.alpha)
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
for raw_name, raw_beta in result.params.items(): #for raw_name, raw_beta in result.params.items():
beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name]) for raw_name, raw_beta in zip(model.exog_names, result._results.params):
beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1])
# Rename terms # Rename terms
if raw_name == 'Intercept': if raw_name == 'Intercept':
# Intercept term (single term) # Intercept term (single term)
term = '(Intercept)' term = '(Intercept)'
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
else: else:
# Parse if required # Parse if required
factor, column, contrast = parse_patsy_term(formula, df, raw_name) factor, column, contrast = parse_patsy_term(formula, df, raw_name)
@ -572,28 +585,42 @@ def regress(
if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]): if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]):
# Treat as single term # Treat as single term
terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
else: else:
# Add a new categorical term if not exists # Add a new categorical term if not exists
if column not in terms: if column not in terms:
ref_category = formula_factor_ref_category(formula, df, factor) if _ref_categories is None:
ref_category = formula_factor_ref_category(formula, df, factor)
else:
ref_category = _ref_categories[column]
terms[column] = CategoricalTerm({}, ref_category) terms[column] = CategoricalTerm({}, ref_category)
terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
else: else:
# Single term # Single term
terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
# Fit null model (for llnull) # Fit null model (for llnull)
if hasattr(result, 'llnull'): if hasattr(result, 'llnull'):
llnull = result.llnull llnull = result.llnull
else: else:
result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit() # Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
dm_exog = pd.DataFrame(index=dmatrices[0].index)
dm_exog['Intercept'] = pd.Series(dtype='float64')
dm_exog['Intercept'].fillna(1, inplace=True)
result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
llnull = result_null.llf llnull = result_null.llf
# Parse raw regression results (to get fit method) if model_class is sm.OLS:
header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1))) method_name = 'Least Squares'
header_dict = {x[0]: x[1] for x in header_entries} else:
# Parse raw regression results to get fit method
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
header_dict = {x[0]: x[1] for x in header_entries}
method_name = header_dict['Method:']
# Get full name to display # Get full name to display
if model_class is sm.Logit: if model_class is sm.Logit:
@ -605,7 +632,7 @@ def regress(
return RegressionResult( return RegressionResult(
result, result,
full_name, model_class.__name__, header_dict['Method:'], full_name, model_class.__name__, method_name,
dep, result.nobs, result.df_model, datetime.now(), result.cov_type, dep, result.nobs, result.df_model, datetime.now(), result.cov_type,
terms, terms,
result.llf, llnull, result.llf, llnull,
@ -697,7 +724,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
import rpy2.robjects.pandas2ri import rpy2.robjects.pandas2ri
# Assume data is already cleaned from regress() # Assume data is already cleaned from regress()
df = self.data.frame.copy() df = self.data.orig_endog.join(self.data.orig_exog)
# Convert bool to int otherwise rpy2 chokes # Convert bool to int otherwise rpy2 chokes
df = df.replace({False: 0, True: 1}) df = df.replace({False: 0, True: 1})