From e71a1aea122f004d6f1b35907d50feebfc8bd3ff Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Tue, 29 Nov 2022 14:40:50 +1100 Subject: [PATCH] Implement bootstrap_regress --- yli/__init__.py | 1 + yli/bootstrap.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++ yli/regress.py | 79 ++++++++++++++++++++++++------------ 3 files changed, 157 insertions(+), 26 deletions(-) create mode 100644 yli/bootstrap.py diff --git a/yli/__init__.py b/yli/__init__.py index 476f901..e6aa3d3 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -15,6 +15,7 @@ # along with this program. If not, see . from .bayes_factors import bayesfactor_afbf +from .bootstrap import bootstrap_regress from .config import config from .descriptives import auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist diff --git a/yli/bootstrap.py b/yli/bootstrap.py new file mode 100644 index 0000000..0f46936 --- /dev/null +++ b/yli/bootstrap.py @@ -0,0 +1,103 @@ +# scipy-yli: Helpful SciPy utilities and recipes +# Copyright © 2022 Lee Yingtong Li (RunasSudo) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np +import pandas as pd +import patsy +from tqdm import tqdm + +from datetime import datetime + +from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress +from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term + +def bootstrap_regress( + model_class, df, dep, formula, *, + nan_policy='warn', + samples=1000, + **kwargs +): + """ + Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values + + :param model_class: See :func:`yli.regress` + :param df: See :func:`yli.regress` + :param dep: See :func:`yli.regress` + :param formula: See :func:`yli.regress` + :param nan_policy: See :func:`yli.regress` + :param samples: Number of bootstrap samples to draw + :type samples: int + :param kwargs: See :func:`yli.regress` + + :rtype: :class:`yli.regress.RegressionResult` + """ + + # Check for/clean NaNs + # Following this, we pass nan_policy='raise' to assert no NaNs remaining + df = df[[dep] + cols_for_formula(formula, df)] + df = check_nan(df, nan_policy) + + # Ensure numeric type for dependent variable + if df[dep].dtype != 'float64': + df[dep] = df[dep].astype('float64') + + # Convert pandas nullable types for independent variables as this breaks statsmodels + df = convert_pandas_nullable(df) + + # Precompute design matrices + dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') + + # Fit full model + full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs) + + # Cache reference categories + ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)} + + # Draw bootstrap samples and regress + bootstrap_results = [] + for i in tqdm(range(samples)): + #bootstrap_sample = df.sample(len(df), replace=True) + #bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs)) + bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True) + bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows]) + bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs)) + + # Combine bootstrap results + terms = {} + for term_name, term in full_model.terms.items(): + if isinstance(term, SingleTerm): + bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results] + bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) + bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) + terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue) + else: + categories = {} + for sub_term_name, sub_term in term.categories.items(): + bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories] + bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) + bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) + categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue) + terms[term_name] = CategoricalTerm(categories, term.ref_category) + + return RegressionResult( + None, + full_model.full_name, full_model.model_name, full_model.fit_method, + dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type, + terms, + full_model.llf, full_model.llnull, + full_model.dof_resid, full_model.rsquared, full_model.f_statistic, + full_model.exp + ) diff --git a/yli/regress.py b/yli/regress.py index ad4ac83..a97b3cd 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd +import patsy from scipy import stats import statsmodels import statsmodels.api as sm @@ -438,7 +439,8 @@ def regress( model_kwargs=None, fit_kwargs=None, family=None, # common model_kwargs cov_type=None, maxiter=None, start_params=None, # common fit_kwargs - bool_baselevels=False, exp=None + bool_baselevels=False, exp=None, + _dmatrices=None, _ref_categories=None, ): """ Fit a statsmodels regression model @@ -526,19 +528,26 @@ def regress( else: exp = False - # Check for/clean NaNs - df = df[[dep] + cols_for_formula(formula, df)] - df = check_nan(df, nan_policy) - - # Ensure numeric type for dependent variable - if df[dep].dtype != 'float64': - df[dep] = df[dep].astype('float64') - - # Convert pandas nullable types for independent variables as this breaks statsmodels - df = convert_pandas_nullable(df) + if _dmatrices is None: + # Check for/clean NaNs + df = df[[dep] + cols_for_formula(formula, df)] + df = check_nan(df, nan_policy) + + # Ensure numeric type for dependent variable + if df[dep].dtype != 'float64': + df[dep] = df[dep].astype('float64') + + # Convert pandas nullable types for independent variables as this breaks statsmodels + df = convert_pandas_nullable(df) + + # Construct design matrix from formula + dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') + else: + dmatrices = _dmatrices # Fit model - model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df, **model_kwargs) + model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs) + model.formula = dep + ' ~ ' + formula result = model.fit(disp=False, **fit_kwargs) if isinstance(result, RegressionResult): @@ -553,16 +562,20 @@ def regress( # Process terms terms = {} - confint = result.conf_int(config.alpha) + # Join term names manually because statsmodels wrapper is very slow + #confint = result.conf_int(config.alpha) + confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))} + pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)} - for raw_name, raw_beta in result.params.items(): - beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name]) + #for raw_name, raw_beta in result.params.items(): + for raw_name, raw_beta in zip(model.exog_names, result._results.params): + beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1]) # Rename terms if raw_name == 'Intercept': # Intercept term (single term) term = '(Intercept)' - terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name]) else: # Parse if required factor, column, contrast = parse_patsy_term(formula, df, raw_name) @@ -572,28 +585,42 @@ def regress( if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]): # Treat as single term - terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name]) else: # Add a new categorical term if not exists if column not in terms: - ref_category = formula_factor_ref_category(formula, df, factor) + if _ref_categories is None: + ref_category = formula_factor_ref_category(formula, df, factor) + else: + ref_category = _ref_categories[column] terms[column] = CategoricalTerm({}, ref_category) - terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name]) else: # Single term - terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name]) + terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name]) # Fit null model (for llnull) if hasattr(result, 'llnull'): llnull = result.llnull else: - result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit() + # Construct null (intercept-only) model + #result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit() + dm_exog = pd.DataFrame(index=dmatrices[0].index) + dm_exog['Intercept'] = pd.Series(dtype='float64') + dm_exog['Intercept'].fillna(1, inplace=True) + + result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit() llnull = result_null.llf - # Parse raw regression results (to get fit method) - header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1))) - header_dict = {x[0]: x[1] for x in header_entries} + if model_class is sm.OLS: + method_name = 'Least Squares' + else: + # Parse raw regression results to get fit method + # Avoid this in general as it can be expensive to summarise all the post hoc tests, etc. + header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1))) + header_dict = {x[0]: x[1] for x in header_entries} + method_name = header_dict['Method:'] # Get full name to display if model_class is sm.Logit: @@ -605,7 +632,7 @@ def regress( return RegressionResult( result, - full_name, model_class.__name__, header_dict['Method:'], + full_name, model_class.__name__, method_name, dep, result.nobs, result.df_model, datetime.now(), result.cov_type, terms, result.llf, llnull, @@ -697,7 +724,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): import rpy2.robjects.pandas2ri # Assume data is already cleaned from regress() - df = self.data.frame.copy() + df = self.data.orig_endog.join(self.data.orig_exog) # Convert bool to int otherwise rpy2 chokes df = df.replace({False: 0, True: 1})