From 645cb7a85e7f718815aec8d0ce46535633bb75ac Mon Sep 17 00:00:00 2001
From: RunasSudo
Date: Tue, 29 Nov 2022 23:00:14 +1100
Subject: [PATCH] Rename bootstrap_regress to regress_bootstrap and update

Improve performance by directly calling statsmodels regression
Report bootstrap in model summary
Add to documentation
---
 docs/regress.rst |   2 +
 yli/__init__.py  |   3 +-
 yli/bootstrap.py | 103 ---------------------------------------------
 yli/regress.py   | 106 ++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 103 insertions(+), 111 deletions(-)
 delete mode 100644 yli/bootstrap.py

diff --git a/docs/regress.rst b/docs/regress.rst
index 2648c74..7c6553e 100644
--- a/docs/regress.rst
+++ b/docs/regress.rst
@@ -10,6 +10,8 @@ Functions
 
 .. autofunction:: yli.regress
 
+.. autofunction:: yli.regress_bootstrap
+
 .. autofunction:: yli.vif
 
 Result classes
diff --git a/yli/__init__.py b/yli/__init__.py
index e6aa3d3..73724dd 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -15,12 +15,11 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 from .bayes_factors import bayesfactor_afbf
-from .bootstrap import bootstrap_regress
 from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import PenalisedLogit, logit_then_regress, regress, vif
+from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
diff --git a/yli/bootstrap.py b/yli/bootstrap.py
deleted file mode 100644
index 0f46936..0000000
--- a/yli/bootstrap.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-import numpy as np
-import pandas as pd
-import patsy
-from tqdm import tqdm
-
-from datetime import datetime
-
-from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
-from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
-
-def bootstrap_regress(
-	model_class, df, dep, formula, *,
-	nan_policy='warn',
-	samples=1000,
-	**kwargs
-):
-	"""
-	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
-	
-	:param model_class: See :func:`yli.regress`
-	:param df: See :func:`yli.regress`
-	:param dep: See :func:`yli.regress`
-	:param formula: See :func:`yli.regress`
-	:param nan_policy: See :func:`yli.regress`
-	:param samples: Number of bootstrap samples to draw
-	:type samples: int
-	:param kwargs: See :func:`yli.regress`
-	
-	:rtype: :class:`yli.regress.RegressionResult`
-	"""
-	
-	# Check for/clean NaNs
-	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
-	df = df[[dep] + cols_for_formula(formula, df)]
-	df = check_nan(df, nan_policy)
-	
-	# Ensure numeric type for dependent variable
-	if df[dep].dtype != 'float64':
-		df[dep] = df[dep].astype('float64')
-	
-	# Convert pandas nullable types for independent variables as this breaks statsmodels
-	df = convert_pandas_nullable(df)
-	
-	# Precompute design matrices
-	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
-	
-	# Fit full model
-	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
-	
-	# Cache reference categories
-	ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
-	
-	# Draw bootstrap samples and regress
-	bootstrap_results = []
-	for i in tqdm(range(samples)):
-		#bootstrap_sample = df.sample(len(df), replace=True)
-		#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
-		bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
-		bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
-		bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
-	
-	# Combine bootstrap results
-	terms = {}
-	for term_name, term in full_model.terms.items():
-		if isinstance(term, SingleTerm):
-			bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
-			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
-		else:
-			categories = {}
-			for sub_term_name, sub_term in term.categories.items():
-				bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
-				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
-			terms[term_name] = CategoricalTerm(categories, term.ref_category)
-	
-	return RegressionResult(
-		None,
-		full_model.full_name, full_model.model_name, full_model.fit_method,
-		dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
-		terms,
-		full_model.llf, full_model.llnull,
-		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
-		full_model.exp
-	)
diff --git a/yli/regress.py b/yli/regress.py
index a97b3cd..1af1f11 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -22,6 +22,7 @@ import statsmodels
 import statsmodels.api as sm
 from statsmodels.iolib.table import SimpleTable
 from statsmodels.stats.outliers_influence import variance_inflation_factor
+from tqdm import tqdm
 
 from datetime import datetime
 import itertools
@@ -254,7 +255,7 @@ class RegressionResult:
 		left_col.append(('Method:', self.fit_method))
 		left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
 		left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
-		left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper()))
+		left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper() if self.cov_type.startswith('hc') else self.cov_type))
 		
 		# Right column
 		right_col = []
@@ -440,7 +441,7 @@ def regress(
 	family=None, # common model_kwargs
 	cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
 	bool_baselevels=False, exp=None,
-	_dmatrices=None, _ref_categories=None,
+	_dmatrices=None,
 ):
 	"""
 	Fit a statsmodels regression model
@@ -589,10 +590,7 @@ def regress(
 		else:
 			# Add a new categorical term if not exists
 			if column not in terms:
-				if _ref_categories is None:
-					ref_category = formula_factor_ref_category(formula, df, factor)
-				else:
-					ref_category = _ref_categories[column]
+				ref_category = formula_factor_ref_category(formula, df, factor)
 				terms[column] = CategoricalTerm({}, ref_category)
 			terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
 
@@ -640,6 +638,102 @@ def regress(
 		exp
 	)
 
+def regress_bootstrap(
+	model_class, df, dep, formula, *,
+	nan_policy='warn',
+	model_kwargs=None, fit_kwargs=None,
+	samples=1000,
+	**kwargs
+):
+	"""
+	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
+	
+	:param model_class: See :func:`yli.regress`
+	:param df: See :func:`yli.regress`
+	:param dep: See :func:`yli.regress`
+	:param formula: See :func:`yli.regress`
+	:param nan_policy: See :func:`yli.regress`
+	:param model_kwargs: See :func:`yli.regress`
+	:param fit_kwargs: See :func:`yli.regress`
+	:param samples: Number of bootstrap samples to draw
+	:type samples: int
+	:param kwargs: Other arguments to pass to :func:`yli.regress`
+	
+	:rtype: :class:`yli.regress.RegressionResult`
+	"""
+	
+	if model_kwargs is None:
+		model_kwargs = {}
+	if fit_kwargs is None:
+		fit_kwargs = {}
+	
+	# Check for/clean NaNs
+	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+	df = df[[dep] + cols_for_formula(formula, df)]
+	df = check_nan(df, nan_policy)
+	
+	# Ensure numeric type for dependent variable
+	if df[dep].dtype != 'float64':
+		df[dep] = df[dep].astype('float64')
+	
+	# Convert pandas nullable types for independent variables as this breaks statsmodels
+	df = convert_pandas_nullable(df)
+	
+	# Precompute design matrices
+	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
+	
+	# Fit full model
+	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs)
+	
+	# Initialise bootstrap_results
+	bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
+	for term in full_model.terms.values():
+		if isinstance(term, SingleTerm):
+			bootstrap_results[term.raw_name] = []
+		else:
+			for sub_term in term.categories.values():
+				bootstrap_results[sub_term.raw_name] = []
+	
+	# Draw bootstrap samples and regress
+	dmatrices = dmatrices[0].join(dmatrices[1])
+	
+	for i in tqdm(range(samples)):
+		bootstrap_rows = dmatrices.sample(len(df), replace=True)
+		model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs)
+		model.formula = dep + ' ~ ' + formula
+		
+		result = model.fit(disp=False, **fit_kwargs)
+		
+		for raw_name, raw_beta in zip(model.exog_names, result._results.params):
+			bootstrap_results[raw_name].append(raw_beta)
+	
+	# Combine bootstrap results
+	terms = {}
+	for term_name, term in full_model.terms.items():
+		if isinstance(term, SingleTerm):
+			bootstrap_betas = bootstrap_results[term.raw_name]
+			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+		else:
+			categories = {}
+			for sub_term_name, sub_term in term.categories.items():
+				bootstrap_betas = bootstrap_results[sub_term.raw_name]
+				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+			terms[term_name] = CategoricalTerm(categories, term.ref_category)
+	
+	return RegressionResult(
+		None,
+		full_model.full_name, full_model.model_name, full_model.fit_method,
+		dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
+		terms,
+		full_model.llf, full_model.llnull,
+		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
+		full_model.exp
+	)
+
 def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
 	"""
 	Perform logistic regression, then use parameters as start parameters for desired regression
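
Usage note (not part of the patch): a minimal sketch of calling the renamed regress_bootstrap. The DataFrame and column names below are hypothetical, and sm.Logit is assumed to be one of the statsmodels model classes that yli.regress accepts. Per the code above, the point estimates come from the full-data fit, while the confidence intervals and p values come from the resampled coefficients, and the summary reports 'Bootstrap' as the standard error type.

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    import yli

    # Hypothetical example data: binary outcome, one numeric and one categorical predictor
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        'Outcome': rng.integers(0, 2, 200),
        'Height': rng.normal(170, 8, 200),
        'Group': rng.choice(['A', 'B'], 200),
    })

    # Draw 500 bootstrap samples of the design matrix and refit the model on each
    result = yli.regress_bootstrap(sm.Logit, df, 'Outcome', 'Height + C(Group)', samples=500)

    # result is a yli.regress.RegressionResult; per-term estimates with bootstrap
    # confidence intervals and p values are collected in result.terms, keyed by
    # term name (here assumed to be 'Height' for the numeric predictor)
    print(result.terms['Height'].beta)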