From 57e472ca09a3ee7e7e0e19ab353f258af4e4d843 Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Tue, 28 Jan 2025 19:31:40 +1100 Subject: [PATCH] Re-implement RegressionModel.bootstrap --- README.md | 1 + yli/regress.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 78757fd..fd6eb93 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ Optional dependencies are: * [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionModel.bayesfactor_beta_zero*) * [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit* * [shap](https://shap.readthedocs.io/en/latest/), for *RegressionModel.shap* +* [tqdm](https://tqdm.github.io/), for *RegressionModel.bootstrap* ## Functions diff --git a/yli/regress.py b/yli/regress.py index 6a15376..9290335 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -38,7 +38,6 @@ from .sig_tests import ChiSquaredResult, FTestResult from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term # TODO: Documentation -# TODO: Bootstrap def vif(df, formula=None, *, nan_policy='warn'): """ @@ -193,6 +192,7 @@ def regress( result.dep = dep result.formula = formula result.nan_policy = nan_policy + result.fit_kwargs = fit_kwargs if exp is not None: result.exp = exp result.fitted_dt = datetime.now() @@ -275,6 +275,7 @@ class RegressionModel: self.dep = None self.formula = None self.nan_policy = None + self.fit_kwargs = None self.exp = False self.cov_type = None @@ -585,6 +586,57 @@ class RegressionModel: return 1 - (self.ll_model - self.dof_model) / self.ll_null + def bootstrap(self, samples=1000): + """ + Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model + + Mutates the current RegressionModel instance. + + :param samples: Number of bootstrap samples to draw + :type samples: int + + :rtype: :class:`yli.regress.RegressionModel` + """ + + from tqdm import tqdm + + df = self.df() + if df is None: + raise Exception('Referenced DataFrame has been dropped') + + # Preprocess data, check for NaN and get design matrices + df_clean, dmatrices, dep_categories = df_to_dmatrices(df, self.dep, self.formula, self.nan_policy, []) + + # Initialise bootstrap_results + bootstrap_results = {} # Dict mapping term raw names to bootstrap betas + for term in self.terms_flat(): + bootstrap_results[term.raw_name] = [] + + # Draw bootstrap samples and regress + dmatrices = dmatrices[0].join(dmatrices[1]) + + for i in tqdm(range(samples)): + bootstrap_rows = dmatrices.sample(len(df), replace=True) + + # Fit model + result = self.__class__.fit(bootstrap_rows.iloc[:,0], bootstrap_rows.iloc[:,1:], **self.fit_kwargs) + + for term in result.terms_flat(): + bootstrap_results[term.raw_name].append(term.beta.point) + + # Combine bootstrap results + for term in self.terms_flat(): + bootstrap_betas = bootstrap_results[term.raw_name] + bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) + bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) + + term.beta = Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)) + term.pvalue = bootstrap_pvalue + + self.cov_type = 'Bootstrap' + + return self + def shap(self, **kwargs): """ Compute SHAP values for the model