From 645cb7a85e7f718815aec8d0ce46535633bb75ac Mon Sep 17 00:00:00 2001
From: RunasSudo
Date: Tue, 29 Nov 2022 23:00:14 +1100
Subject: [PATCH] Rename bootstrap_regress to regress_bootstrap and update

Improve performance by directly calling statsmodels regression
Report bootstrap in model summary
Add to documentation
---
 docs/regress.rst |   2 +
 yli/__init__.py  |   3 +-
 yli/bootstrap.py | 103 ---------------------------------------------
 yli/regress.py   | 106 ++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 103 insertions(+), 111 deletions(-)
 delete mode 100644 yli/bootstrap.py

diff --git a/docs/regress.rst b/docs/regress.rst
index 2648c74..7c6553e 100644
--- a/docs/regress.rst
+++ b/docs/regress.rst
@@ -10,6 +10,8 @@ Functions
 
 .. autofunction:: yli.regress
 
+.. autofunction:: yli.regress_bootstrap
+
 .. autofunction:: yli.vif
 
 Result classes
diff --git a/yli/__init__.py b/yli/__init__.py
index e6aa3d3..73724dd 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -15,12 +15,11 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 from .bayes_factors import bayesfactor_afbf
-from .bootstrap import bootstrap_regress
 from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import PenalisedLogit, logit_then_regress, regress, vif
+from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
diff --git a/yli/bootstrap.py b/yli/bootstrap.py
deleted file mode 100644
index 0f46936..0000000
--- a/yli/bootstrap.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# scipy-yli: Helpful SciPy utilities and recipes
-# Copyright © 2022 Lee Yingtong Li (RunasSudo)
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-import numpy as np
-import pandas as pd
-import patsy
-from tqdm import tqdm
-
-from datetime import datetime
-
-from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
-from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
-
-def bootstrap_regress(
-	model_class, df, dep, formula, *,
-	nan_policy='warn',
-	samples=1000,
-	**kwargs
-):
-	"""
-	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
-	
-	:param model_class: See :func:`yli.regress`
-	:param df: See :func:`yli.regress`
-	:param dep: See :func:`yli.regress`
-	:param formula: See :func:`yli.regress`
-	:param nan_policy: See :func:`yli.regress`
-	:param samples: Number of bootstrap samples to draw
-	:type samples: int
-	:param kwargs: See :func:`yli.regress`
-	
-	:rtype: :class:`yli.regress.RegressionResult`
-	"""
-	
-	# Check for/clean NaNs
-	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
-	df = df[[dep] + cols_for_formula(formula, df)]
-	df = check_nan(df, nan_policy)
-	
-	# Ensure numeric type for dependent variable
-	if df[dep].dtype != 'float64':
-		df[dep] = df[dep].astype('float64')
-	
-	# Convert pandas nullable types for independent variables as this breaks statsmodels
-	df = convert_pandas_nullable(df)
-	
-	# Precompute design matrices
-	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
-	
-	# Fit full model
-	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
-	
-	# Cache reference categories
-	ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
-	
-	# Draw bootstrap samples and regress
-	bootstrap_results = []
-	for i in tqdm(range(samples)):
-		#bootstrap_sample = df.sample(len(df), replace=True)
-		#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
-		bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
-		bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
-		bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
-	
-	# Combine bootstrap results
-	terms = {}
-	for term_name, term in full_model.terms.items():
-		if isinstance(term, SingleTerm):
-			bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
-			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
-		else:
-			categories = {}
-			for sub_term_name, sub_term in term.categories.items():
-				bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
-				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
-			terms[term_name] = CategoricalTerm(categories, term.ref_category)
-	
-	return RegressionResult(
-		None,
-		full_model.full_name, full_model.model_name, full_model.fit_method,
-		dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
-		terms,
-		full_model.llf, full_model.llnull,
-		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
-		full_model.exp
-	)
diff --git a/yli/regress.py b/yli/regress.py
index a97b3cd..1af1f11 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -22,6 +22,7 @@ import statsmodels
 import statsmodels.api as sm
 from statsmodels.iolib.table import SimpleTable
 from statsmodels.stats.outliers_influence import variance_inflation_factor
+from tqdm import tqdm
 
 from datetime import datetime
 import itertools
@@ -254,7 +255,7 @@ class RegressionResult:
 		left_col.append(('Method:', self.fit_method))
 		left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
 		left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
-		left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper()))
+		left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper() if self.cov_type.startswith('hc') else self.cov_type))
 		
 		# Right column
 		right_col = []
@@ -440,7 +441,7 @@ def regress(
 	family=None, # common model_kwargs
 	cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
 	bool_baselevels=False, exp=None,
-	_dmatrices=None, _ref_categories=None,
+	_dmatrices=None,
 ):
 	"""
 	Fit a statsmodels regression model
@@ -589,10 +590,7 @@ def regress(
 		else:
 			# Add a new categorical term if not exists
 			if column not in terms:
-				if _ref_categories is None:
-					ref_category = formula_factor_ref_category(formula, df, factor)
-				else:
-					ref_category = _ref_categories[column]
+				ref_category = formula_factor_ref_category(formula, df, factor)
 				terms[column] = CategoricalTerm({}, ref_category)
 			terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
 
@@ -640,6 +638,102 @@ def regress(
 		exp
 	)
 
+def regress_bootstrap(
+	model_class, df, dep, formula, *,
+	nan_policy='warn',
+	model_kwargs=None, fit_kwargs=None,
+	samples=1000,
+	**kwargs
+):
+	"""
+	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
+	
+	:param model_class: See :func:`yli.regress`
+	:param df: See :func:`yli.regress`
+	:param dep: See :func:`yli.regress`
+	:param formula: See :func:`yli.regress`
+	:param nan_policy: See :func:`yli.regress`
+	:param model_kwargs: See :func:`yli.regress`
+	:param fit_kwargs: See :func:`yli.regress`
+	:param samples: Number of bootstrap samples to draw
+	:type samples: int
+	:param kwargs: Other arguments to pass to :func:`yli.regress`
+	
+	:rtype: :class:`yli.regress.RegressionResult`
+	"""
+	
+	if model_kwargs is None:
+		model_kwargs = {}
+	if fit_kwargs is None:
+		fit_kwargs = {}
+	
+	# Check for/clean NaNs
+	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+	df = df[[dep] + cols_for_formula(formula, df)]
+	df = check_nan(df, nan_policy)
+	
+	# Ensure numeric type for dependent variable
+	if df[dep].dtype != 'float64':
+		df[dep] = df[dep].astype('float64')
+	
+	# Convert pandas nullable types for independent variables as this breaks statsmodels
+	df = convert_pandas_nullable(df)
+	
+	# Precompute design matrices
+	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
+	
+	# Fit full model
+	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs)
+	
+	# Initialise bootstrap_results
+	bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
+	for term in full_model.terms.values():
+		if isinstance(term, SingleTerm):
+			bootstrap_results[term.raw_name] = []
+		else:
+			for sub_term in term.categories.values():
+				bootstrap_results[sub_term.raw_name] = []
+	
+	# Draw bootstrap samples and regress
+	dmatrices = dmatrices[0].join(dmatrices[1])
+	
+	for i in tqdm(range(samples)):
+		bootstrap_rows = dmatrices.sample(len(df), replace=True)
+		model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs)
+		model.formula = dep + ' ~ ' + formula
+		
+		result = model.fit(disp=False, **fit_kwargs)
+		
+		for raw_name, raw_beta in zip(model.exog_names, result._results.params):
+			bootstrap_results[raw_name].append(raw_beta)
+	
+	# Combine bootstrap results
+	terms = {}
+	for term_name, term in full_model.terms.items():
+		if isinstance(term, SingleTerm):
+			bootstrap_betas = bootstrap_results[term.raw_name]
+			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+		else:
+			categories = {}
+			for sub_term_name, sub_term in term.categories.items():
+				bootstrap_betas = bootstrap_results[sub_term.raw_name]
+				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+			terms[term_name] = CategoricalTerm(categories, term.ref_category)
+	
+	return RegressionResult(
+		None,
+		full_model.full_name, full_model.model_name, full_model.fit_method,
+		dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
+		terms,
+		full_model.llf, full_model.llnull,
+		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
+		full_model.exp
+	)
+
 def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
 	"""
 	Perform logistic regression, then use parameters as start parameters for desired regression
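
Usage note (not part of the patch): a minimal sketch of calling the renamed regress_bootstrap. The DataFrame and column names below are hypothetical, and sm.Logit is assumed to be one of the statsmodels model classes that yli.regress accepts. Per the code above, the point estimates come from the full-data fit, while the confidence intervals and p values come from the resampled coefficients, and the summary reports 'Bootstrap' as the standard error type.

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    import yli

    # Hypothetical example data: binary outcome, one numeric and one categorical predictor
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        'Outcome': rng.integers(0, 2, 200),
        'Height': rng.normal(170, 8, 200),
        'Group': rng.choice(['A', 'B'], 200),
    })

    # Draw 500 bootstrap samples of the design matrix and refit the model on each
    result = yli.regress_bootstrap(sm.Logit, df, 'Outcome', 'Height + C(Group)', samples=500)

    # result is a yli.regress.RegressionResult; per-term estimates with bootstrap
    # confidence intervals and p values are collected in result.terms, keyed by
    # term name (here assumed to be 'Height' for the numeric predictor)
    print(result.terms['Height'].beta)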