Implement bootstrap_regress
This commit is contained in:
parent
e268f385be
commit
e71a1aea12
@ -15,6 +15,7 @@
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
from .bayes_factors import bayesfactor_afbf
|
||||
from .bootstrap import bootstrap_regress
|
||||
from .config import config
|
||||
from .descriptives import auto_descriptives
|
||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||
|
103
yli/bootstrap.py
Normal file
103
yli/bootstrap.py
Normal file
@ -0,0 +1,103 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import patsy
|
||||
from tqdm import tqdm
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
|
||||
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||
|
||||
def bootstrap_regress(
|
||||
model_class, df, dep, formula, *,
|
||||
nan_policy='warn',
|
||||
samples=1000,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
|
||||
|
||||
:param model_class: See :func:`yli.regress`
|
||||
:param df: See :func:`yli.regress`
|
||||
:param dep: See :func:`yli.regress`
|
||||
:param formula: See :func:`yli.regress`
|
||||
:param nan_policy: See :func:`yli.regress`
|
||||
:param samples: Number of bootstrap samples to draw
|
||||
:type samples: int
|
||||
:param kwargs: See :func:`yli.regress`
|
||||
|
||||
:rtype: :class:`yli.regress.RegressionResult`
|
||||
"""
|
||||
|
||||
# Check for/clean NaNs
|
||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||
df = df[[dep] + cols_for_formula(formula, df)]
|
||||
df = check_nan(df, nan_policy)
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
|
||||
# Precompute design matrices
|
||||
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
||||
|
||||
# Fit full model
|
||||
full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
|
||||
|
||||
# Cache reference categories
|
||||
ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
|
||||
|
||||
# Draw bootstrap samples and regress
|
||||
bootstrap_results = []
|
||||
for i in tqdm(range(samples)):
|
||||
#bootstrap_sample = df.sample(len(df), replace=True)
|
||||
#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
|
||||
bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
|
||||
bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
|
||||
bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
|
||||
|
||||
# Combine bootstrap results
|
||||
terms = {}
|
||||
for term_name, term in full_model.terms.items():
|
||||
if isinstance(term, SingleTerm):
|
||||
bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
|
||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
|
||||
else:
|
||||
categories = {}
|
||||
for sub_term_name, sub_term in term.categories.items():
|
||||
bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
|
||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
|
||||
terms[term_name] = CategoricalTerm(categories, term.ref_category)
|
||||
|
||||
return RegressionResult(
|
||||
None,
|
||||
full_model.full_name, full_model.model_name, full_model.fit_method,
|
||||
dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
|
||||
terms,
|
||||
full_model.llf, full_model.llnull,
|
||||
full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
|
||||
full_model.exp
|
||||
)
|
@ -16,6 +16,7 @@
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import patsy
|
||||
from scipy import stats
|
||||
import statsmodels
|
||||
import statsmodels.api as sm
|
||||
@ -438,7 +439,8 @@ def regress(
|
||||
model_kwargs=None, fit_kwargs=None,
|
||||
family=None, # common model_kwargs
|
||||
cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
|
||||
bool_baselevels=False, exp=None
|
||||
bool_baselevels=False, exp=None,
|
||||
_dmatrices=None, _ref_categories=None,
|
||||
):
|
||||
"""
|
||||
Fit a statsmodels regression model
|
||||
@ -526,19 +528,26 @@ def regress(
|
||||
else:
|
||||
exp = False
|
||||
|
||||
# Check for/clean NaNs
|
||||
df = df[[dep] + cols_for_formula(formula, df)]
|
||||
df = check_nan(df, nan_policy)
|
||||
if _dmatrices is None:
|
||||
# Check for/clean NaNs
|
||||
df = df[[dep] + cols_for_formula(formula, df)]
|
||||
df = check_nan(df, nan_policy)
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
# Ensure numeric type for dependent variable
|
||||
if df[dep].dtype != 'float64':
|
||||
df[dep] = df[dep].astype('float64')
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
df = convert_pandas_nullable(df)
|
||||
|
||||
# Construct design matrix from formula
|
||||
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
||||
else:
|
||||
dmatrices = _dmatrices
|
||||
|
||||
# Fit model
|
||||
model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df, **model_kwargs)
|
||||
model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs)
|
||||
model.formula = dep + ' ~ ' + formula
|
||||
result = model.fit(disp=False, **fit_kwargs)
|
||||
|
||||
if isinstance(result, RegressionResult):
|
||||
@ -553,16 +562,20 @@ def regress(
|
||||
# Process terms
|
||||
terms = {}
|
||||
|
||||
confint = result.conf_int(config.alpha)
|
||||
# Join term names manually because statsmodels wrapper is very slow
|
||||
#confint = result.conf_int(config.alpha)
|
||||
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
|
||||
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
|
||||
|
||||
for raw_name, raw_beta in result.params.items():
|
||||
beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name])
|
||||
#for raw_name, raw_beta in result.params.items():
|
||||
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
|
||||
beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1])
|
||||
|
||||
# Rename terms
|
||||
if raw_name == 'Intercept':
|
||||
# Intercept term (single term)
|
||||
term = '(Intercept)'
|
||||
terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
else:
|
||||
# Parse if required
|
||||
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
||||
@ -572,28 +585,42 @@ def regress(
|
||||
|
||||
if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]):
|
||||
# Treat as single term
|
||||
terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
else:
|
||||
# Add a new categorical term if not exists
|
||||
if column not in terms:
|
||||
ref_category = formula_factor_ref_category(formula, df, factor)
|
||||
if _ref_categories is None:
|
||||
ref_category = formula_factor_ref_category(formula, df, factor)
|
||||
else:
|
||||
ref_category = _ref_categories[column]
|
||||
terms[column] = CategoricalTerm({}, ref_category)
|
||||
|
||||
terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
else:
|
||||
# Single term
|
||||
terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
|
||||
terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||
|
||||
# Fit null model (for llnull)
|
||||
if hasattr(result, 'llnull'):
|
||||
llnull = result.llnull
|
||||
else:
|
||||
result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
|
||||
# Construct null (intercept-only) model
|
||||
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
|
||||
dm_exog = pd.DataFrame(index=dmatrices[0].index)
|
||||
dm_exog['Intercept'] = pd.Series(dtype='float64')
|
||||
dm_exog['Intercept'].fillna(1, inplace=True)
|
||||
|
||||
result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
|
||||
llnull = result_null.llf
|
||||
|
||||
# Parse raw regression results (to get fit method)
|
||||
header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
|
||||
header_dict = {x[0]: x[1] for x in header_entries}
|
||||
if model_class is sm.OLS:
|
||||
method_name = 'Least Squares'
|
||||
else:
|
||||
# Parse raw regression results to get fit method
|
||||
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
|
||||
header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
|
||||
header_dict = {x[0]: x[1] for x in header_entries}
|
||||
method_name = header_dict['Method:']
|
||||
|
||||
# Get full name to display
|
||||
if model_class is sm.Logit:
|
||||
@ -605,7 +632,7 @@ def regress(
|
||||
|
||||
return RegressionResult(
|
||||
result,
|
||||
full_name, model_class.__name__, header_dict['Method:'],
|
||||
full_name, model_class.__name__, method_name,
|
||||
dep, result.nobs, result.df_model, datetime.now(), result.cov_type,
|
||||
terms,
|
||||
result.llf, llnull,
|
||||
@ -697,7 +724,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
|
||||
import rpy2.robjects.pandas2ri
|
||||
|
||||
# Assume data is already cleaned from regress()
|
||||
df = self.data.frame.copy()
|
||||
df = self.data.orig_endog.join(self.data.orig_exog)
|
||||
|
||||
# Convert bool to int otherwise rpy2 chokes
|
||||
df = df.replace({False: 0, True: 1})
|
||||
|
Loading…
Reference in New Issue
Block a user