# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.

import numpy as np
import pandas as pd
import patsy
from tqdm import tqdm

from datetime import datetime

from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term


def _bootstrap_single_term(raw_name, point, bootstrap_betas):
    """
    Build a SingleTerm whose interval and *p* value come from bootstrap estimates

    :param raw_name: Raw name to give the resulting term
    :param point: Point estimate to report (taken from the full-sample fit by the caller)
    :param bootstrap_betas: Coefficient estimates for this term, one per bootstrap fit
    :type bootstrap_betas: list

    :rtype: :class:`yli.regress.SingleTerm`
    """

    # Two-sided p value: twice the smaller of the share of bootstrap
    # estimates below zero and the share at or above zero
    share_negative = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
    pvalue = 2 * min(share_negative, 1 - share_negative)

    # Interval bounds are the 2.5% and 97.5% quantiles of the bootstrap estimates
    estimate = Estimate(point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975))

    return SingleTerm(raw_name, estimate, pvalue)


def bootstrap_regress(
    model_class, df, dep, formula, *,
    nan_policy='warn',
    samples=1000,
    **kwargs
):
    """
    Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values

    The model is first fitted to the full data, and the point estimates of the
    returned terms are taken from that fit. Rows of the precomputed design
    matrices are then resampled with replacement *samples* times, refitting the
    model each time. For each term, the interval bounds are the 2.5% and 97.5%
    quantiles of the bootstrap coefficients, and the *p* value is twice the
    smaller of the proportion of bootstrap coefficients below zero and its
    complement.

    :param model_class: See :func:`yli.regress`
    :param df: See :func:`yli.regress`
    :param dep: See :func:`yli.regress`
    :param formula: See :func:`yli.regress`
    :param nan_policy: See :func:`yli.regress`
    :param samples: Number of bootstrap samples to draw (must be at least 1)
    :type samples: int
    :param kwargs: See :func:`yli.regress`

    :raises ValueError: If *samples* is less than 1

    :rtype: :class:`yli.regress.RegressionResult`
    """

    # Without at least one bootstrap fit there is nothing to take quantiles of
    if samples < 1:
        raise ValueError('samples must be at least 1')

    # Check for/clean NaNs
    # Following this, we pass nan_policy='raise' to assert no NaNs remaining
    df = df[[dep] + cols_for_formula(formula, df)]
    df = check_nan(df, nan_policy)

    # Ensure numeric type for dependent variable
    if df[dep].dtype != 'float64':
        df[dep] = df[dep].astype('float64')

    # Convert pandas nullable types for independent variables as this breaks statsmodels
    df = convert_pandas_nullable(df)

    # Precompute design matrices
    dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')

    # Fit full model
    full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)

    # Cache reference categories
    ref_categories = {
        term_name: term.ref_category
        for term_name, term in full_model.terms.items()
        if isinstance(term, CategoricalTerm)
    }

    # Draw bootstrap samples and regress
    # The row labels do not change between draws, so build the Series once
    row_labels = pd.Series(dmatrices[0].index)
    bootstrap_results = []
    for _ in tqdm(range(samples)):
        # Resample rows of the design matrices directly rather than rebuilding them from df
        bootstrap_rows = row_labels.sample(len(df), replace=True)
        bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
        bootstrap_results.append(regress(
            model_class, df, dep, formula,
            nan_policy='raise',
            _dmatrices=bootstrap_dmatrices,
            _ref_categories=ref_categories,
            **kwargs
        ))

    # Combine bootstrap results
    terms = {}
    for term_name, term in full_model.terms.items():
        if isinstance(term, SingleTerm):
            bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
            terms[term_name] = _bootstrap_single_term(term.raw_name, term.beta.point, bootstrap_betas)
        else:
            categories = {}
            for sub_term_name, sub_term in term.categories.items():
                # Only use the bootstrap fits that report this category
                # (presumably a resample can lack a category entirely - TODO confirm)
                bootstrap_betas = [
                    r.terms[term_name].categories[sub_term_name].beta.point
                    for r in bootstrap_results
                    if sub_term_name in r.terms[term_name].categories
                ]
                categories[sub_term_name] = _bootstrap_single_term(sub_term.raw_name, sub_term.beta.point, bootstrap_betas)
            terms[term_name] = CategoricalTerm(categories, term.ref_category)

    # Everything other than the terms is carried over from the full-sample fit
    # NOTE(review): the first argument is passed as None - presumably there is no
    # single underlying raw result for a bootstrapped fit; confirm against RegressionResult
    return RegressionResult(
        None,
        full_model.full_name, full_model.model_name, full_model.fit_method,
        dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
        terms,
        full_model.llf, full_model.llnull,
        full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
        full_model.exp
    )