From e71a1aea122f004d6f1b35907d50feebfc8bd3ff Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Tue, 29 Nov 2022 14:40:50 +1100
Subject: [PATCH] Implement bootstrap_regress

---
 yli/__init__.py  |   1 +
 yli/bootstrap.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 yli/regress.py   |  79 ++++++++++++++++++++++++------------
 3 files changed, 157 insertions(+), 26 deletions(-)
 create mode 100644 yli/bootstrap.py

diff --git a/yli/__init__.py b/yli/__init__.py
index 476f901..e6aa3d3 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -15,6 +15,7 @@
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from .bayes_factors import bayesfactor_afbf
+from .bootstrap import bootstrap_regress
 from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
diff --git a/yli/bootstrap.py b/yli/bootstrap.py
new file mode 100644
index 0000000..0f46936
--- /dev/null
+++ b/yli/bootstrap.py
@@ -0,0 +1,103 @@
+#   scipy-yli: Helpful SciPy utilities and recipes
+#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+import pandas as pd
+import patsy
+from tqdm import tqdm
+
+from datetime import datetime
+
+from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
+from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
+
+def bootstrap_regress(
+	model_class, df, dep, formula, *,
+	nan_policy='warn',
+	samples=1000,
+	**kwargs
+):
+	"""
+	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
+	
+	:param model_class: See :func:`yli.regress`
+	:param df: See :func:`yli.regress`
+	:param dep: See :func:`yli.regress`
+	:param formula: See :func:`yli.regress`
+	:param nan_policy: See :func:`yli.regress`
+	:param samples: Number of bootstrap samples to draw
+	:type samples: int
+	:param kwargs: See :func:`yli.regress`
+	
+	:rtype: :class:`yli.regress.RegressionResult`
+	"""
+	
+	# Check for/clean NaNs
+	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+	df = df[[dep] + cols_for_formula(formula, df)]
+	df = check_nan(df, nan_policy)
+	
+	# Ensure numeric type for dependent variable
+	if df[dep].dtype != 'float64':
+		df[dep] = df[dep].astype('float64')
+	
+	# Convert pandas nullable types for independent variables as this breaks statsmodels
+	df = convert_pandas_nullable(df)
+	
+	# Precompute design matrices
+	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
+	
+	# Fit full model
+	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
+	
+	# Cache reference categories
+	ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
+	
+	# Draw bootstrap samples and regress
+	bootstrap_results = []
+	for i in tqdm(range(samples)):
+		#bootstrap_sample = df.sample(len(df), replace=True)
+		#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
+		bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
+		bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
+		bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
+	
+	# Combine bootstrap results
+	terms = {}
+	for term_name, term in full_model.terms.items():
+		if isinstance(term, SingleTerm):
+			bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
+			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
+		else:
+			categories = {}
+			for sub_term_name, sub_term in term.categories.items():
+				bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
+				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
+			terms[term_name] = CategoricalTerm(categories, term.ref_category)
+	
+	return RegressionResult(
+		None,
+		full_model.full_name, full_model.model_name, full_model.fit_method,
+		dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
+		terms,
+		full_model.llf, full_model.llnull,
+		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
+		full_model.exp
+	)
diff --git a/yli/regress.py b/yli/regress.py
index ad4ac83..a97b3cd 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import pandas as pd
+import patsy
 from scipy import stats
 import statsmodels
 import statsmodels.api as sm
@@ -438,7 +439,8 @@ def regress(
 	model_kwargs=None, fit_kwargs=None,
 	family=None,  # common model_kwargs
 	cov_type=None, maxiter=None, start_params=None,  # common fit_kwargs
-	bool_baselevels=False, exp=None
+	bool_baselevels=False, exp=None,
+	_dmatrices=None, _ref_categories=None,
 ):
 	"""
 	Fit a statsmodels regression model
@@ -526,19 +528,26 @@ def regress(
 		else:
 			exp = False
 	
-	# Check for/clean NaNs
-	df = df[[dep] + cols_for_formula(formula, df)]
-	df = check_nan(df, nan_policy)
-	
-	# Ensure numeric type for dependent variable
-	if df[dep].dtype != 'float64':
-		df[dep] = df[dep].astype('float64')
-	
-	# Convert pandas nullable types for independent variables as this breaks statsmodels
-	df = convert_pandas_nullable(df)
+	if _dmatrices is None:
+		# Check for/clean NaNs
+		df = df[[dep] + cols_for_formula(formula, df)]
+		df = check_nan(df, nan_policy)
+		
+		# Ensure numeric type for dependent variable
+		if df[dep].dtype != 'float64':
+			df[dep] = df[dep].astype('float64')
+		
+		# Convert pandas nullable types for independent variables as this breaks statsmodels
+		df = convert_pandas_nullable(df)
+		
+		# Construct design matrix from formula
+		dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
+	else:
+		dmatrices = _dmatrices
 	
 	# Fit model
-	model = model_class.from_formula(formula=dep + ' ~ ' + formula, data=df, **model_kwargs)
+	model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs)
+	model.formula = dep + ' ~ ' + formula
 	result = model.fit(disp=False, **fit_kwargs)
 	
 	if isinstance(result, RegressionResult):
@@ -553,16 +562,20 @@ def regress(
 	# Process terms
 	terms = {}
 	
-	confint = result.conf_int(config.alpha)
+	# Join term names manually because statsmodels wrapper is very slow
+	#confint = result.conf_int(config.alpha)
+	confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
+	pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
 	
-	for raw_name, raw_beta in result.params.items():
-		beta = Estimate(raw_beta, confint[0][raw_name], confint[1][raw_name])
+	#for raw_name, raw_beta in result.params.items():
+	for raw_name, raw_beta in zip(model.exog_names, result._results.params):
+		beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1])
 		
 		# Rename terms
 		if raw_name == 'Intercept':
 			# Intercept term (single term)
 			term = '(Intercept)'
-			terms[term] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+			terms[term] = SingleTerm(raw_name, beta, pvalues[raw_name])
 		else:
 			# Parse if required
 			factor, column, contrast = parse_patsy_term(formula, df, raw_name)
@@ -572,28 +585,42 @@ def regress(
 				
 				if bool_baselevels is False and contrast == 'True' and set(df[column].unique()) == set([True, False]):
 					# Treat as single term
-					terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+					terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
 				else:
 					# Add a new categorical term if not exists
 					if column not in terms:
-						ref_category = formula_factor_ref_category(formula, df, factor)
+						if _ref_categories is None:
+							ref_category = formula_factor_ref_category(formula, df, factor)
+						else:
+							ref_category = _ref_categories[column]
 						terms[column] = CategoricalTerm({}, ref_category)
 					
-					terms[column].categories[contrast] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+					terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
 			else:
 				# Single term
-				terms[column] = SingleTerm(raw_name, beta, result.pvalues[raw_name])
+				terms[column] = SingleTerm(raw_name, beta, pvalues[raw_name])
 	
 	# Fit null model (for llnull)
 	if hasattr(result, 'llnull'):
 		llnull = result.llnull
 	else:
-		result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
+		# Construct null (intercept-only) model
+		#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
+		dm_exog = pd.DataFrame(index=dmatrices[0].index)
+		dm_exog['Intercept'] = pd.Series(dtype='float64')
+		dm_exog['Intercept'].fillna(1, inplace=True)
+		
+		result_null = model_class(endog=dmatrices[0], exog=dm_exog).fit()
 		llnull = result_null.llf
 	
-	# Parse raw regression results (to get fit method)
-	header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
-	header_dict = {x[0]: x[1] for x in header_entries}
+	if model_class is sm.OLS:
+		method_name = 'Least Squares'
+	else:
+		# Parse raw regression results to get fit method
+		# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
+		header_entries = np.vectorize(str.strip)(np.concatenate(np.split(np.array(result.summary().tables[0].data), 2, axis=1)))
+		header_dict = {x[0]: x[1] for x in header_entries}
+		method_name = header_dict['Method:']
 	
 	# Get full name to display
 	if model_class is sm.Logit:
@@ -605,7 +632,7 @@ def regress(
 	
 	return RegressionResult(
 		result,
-		full_name, model_class.__name__, header_dict['Method:'],
+		full_name, model_class.__name__, method_name,
 		dep, result.nobs, result.df_model, datetime.now(), result.cov_type,
 		terms,
 		result.llf, llnull,
@@ -697,7 +724,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 		import rpy2.robjects.pandas2ri
 		
 		# Assume data is already cleaned from regress()
-		df = self.data.frame.copy()
+		df = self.data.orig_endog.join(self.data.orig_exog)
 		
 		# Convert bool to int otherwise rpy2 chokes
 		df = df.replace({False: 0, True: 1})