Use shap "new" Explanation API

Re-implement RegressionModel.bootstrap
Remove cols_for_formula dependency on patsy
2025-01-29 00:59:25 +11:00 · 2025-01-28 19:31:40 +11:00 · 2025-01-28 18:41:40 +11:00 · 2025-01-28 18:40:07 +11:00
5 changed files with 87 additions and 29 deletions
--- a/README.md
+++ b/README.md
@ -53,6 +53,7 @@ Optional dependencies are:
 	* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionModel.bayesfactor_beta_zero*)
 	* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
 * [shap](https://shap.readthedocs.io/en/latest/), for *RegressionModel.shap*
+* [tqdm](https://tqdm.github.io/), for *RegressionModel.bootstrap*

 ## Functions

--- a/yli/distributions.py
+++ b/yli/distributions.py
@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2025  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@ -285,7 +285,7 @@ BETA_ODDSRATIO_CDF_TERMS = [100]
 beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0)  # a = lower bound of support

 class transformed_gen(stats.rv_continuous):
-	"""
+	r"""
 	Represents a transformation, *Y*, of a "base" random variable, *X*
 	
 	This is a SciPy *stats.rv_continuous* distribution which takes parameters as described below.
--- a/yli/regress.py
+++ b/yli/regress.py
@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2025  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@ -38,7 +38,6 @@ from .sig_tests import ChiSquaredResult, FTestResult
 from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term

 # TODO: Documentation
-# TODO: Bootstrap

 def vif(df, formula=None, *, nan_policy='warn'):
 	"""
@ -193,6 +192,7 @@ def regress(
 	result.dep = dep
 	result.formula = formula
 	result.nan_policy = nan_policy
+	result.fit_kwargs = fit_kwargs
 	if exp is not None:
 		result.exp = exp
 	result.fitted_dt = datetime.now()
@ -275,6 +275,7 @@ class RegressionModel:
 		self.dep = None
 		self.formula = None
 		self.nan_policy = None
+		self.fit_kwargs = None
 		self.exp = False
 		self.cov_type = None
 		
@ -494,7 +495,7 @@ class RegressionModel:
 		return out
 	
 	def terms_flat(self):
-		"""
+		r"""
 		Iterate over each :class:`SingleTerm` in *self.terms*, recursively stepping through :class:`CategoricalTerm`\ s
 		"""
 		
@ -585,6 +586,57 @@ class RegressionModel:
 		
 		return 1 - (self.ll_model - self.dof_model) / self.ll_null
 	
+	def bootstrap(self, samples=1000):
+		"""
+		Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model
+		
+		Mutates the current RegressionModel instance.
+		
+		:param samples: Number of bootstrap samples to draw
+		:type samples: int
+		
+		:rtype: :class:`yli.regress.RegressionModel`
+		"""
+		
+		from tqdm import tqdm
+		
+		df = self.df()
+		if df is None:
+			raise Exception('Referenced DataFrame has been dropped')
+		
+		# Preprocess data, check for NaN and get design matrices
+		df_clean, dmatrices, dep_categories = df_to_dmatrices(df, self.dep, self.formula, self.nan_policy, [])
+		
+		# Initialise bootstrap_results
+		bootstrap_results = {}  # Dict mapping term raw names to bootstrap betas
+		for term in self.terms_flat():
+			bootstrap_results[term.raw_name] = []
+		
+		# Draw bootstrap samples and regress
+		dmatrices = dmatrices[0].join(dmatrices[1])
+		
+		for i in tqdm(range(samples)):
+			bootstrap_rows = dmatrices.sample(len(df), replace=True)
+			
+			# Fit model
+			result = self.__class__.fit(bootstrap_rows.iloc[:,0], bootstrap_rows.iloc[:,1:], **self.fit_kwargs)
+			
+			for term in result.terms_flat():
+				bootstrap_results[term.raw_name].append(term.beta.point)
+		
+		# Combine bootstrap results
+		for term in self.terms_flat():
+			bootstrap_betas = bootstrap_results[term.raw_name]
+			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+			
+			term.beta = Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2))
+			term.pvalue = bootstrap_pvalue
+		
+		self.cov_type = 'Bootstrap'
+		
+		return self
+	
 	def shap(self, **kwargs):
 		"""
 		Compute SHAP values for the model
@ -611,9 +663,9 @@ class RegressionModel:
 				params.extend(s.beta.point for s in term.categories.values())
 		
 		explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs)  # FIXME: Assumes zeroth term is intercept
-		shap_values = explainer.shap_values(xdata).astype('float')
+		explanation = explainer(xdata)
 		
-		return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
+		return ShapResult(weakref.ref(self), explanation)

 class LikelihoodRatioTestResult(ChiSquaredResult):
 	"""
--- a/yli/shap.py
+++ b/yli/shap.py
@ -1,3 +1,19 @@
+#   scipy-yli: Helpful SciPy utilities and recipes
+#   Copyright © 2022–2025  Lee Yingtong Li (RunasSudo)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
 import pandas as pd
 import patsy

@ -10,10 +26,9 @@ class ShapResult:
 	See :meth:`yli.regress.RegressionModel.shap`.
 	"""
 	
-	def __init__(self, model, shap_values, features):
+	def __init__(self, model, explanation):
 		self.model = model
-		self.shap_values = shap_values
-		self.features = features
+		self.explanation = explanation
 	
 	@staticmethod
 	def _get_xdata(model):
@ -29,6 +44,7 @@ class ShapResult:
 		df = check_nan(df, 'omit')
 		
 		# Ensure numeric type for dependent variable
+		# TODO: Is this step necessary?
 		df[dep], dep_categories = as_numeric(df[dep])
 		
 		# Convert pandas nullable types for independent variables as this breaks statsmodels
@ -47,7 +63,8 @@ class ShapResult:
 		:rtype: Series
 		"""
 		
-		return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
+		raise NotImplementedError()
+		#return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
 	
 	def plot(self, **kwargs):
 		"""
@ -65,14 +82,6 @@ class ShapResult:
 		if model is None:
 			raise Exception('Referenced RegressionModel has been dropped')
 		
-		xdata = self._get_xdata(model)
-		
-		shap.summary_plot(self.shap_values, xdata, show=False, axis_color='black', **kwargs)  # pass show=False to get gcf/gca
-		
-		# Fix colour bar
-		# https://stackoverflow.com/questions/70461753/shap-the-color-bar-is-not-displayed-in-the-summary-plot
-		ax_colorbar = plt.gcf().axes[-1]
-		ax_colorbar.set_aspect('auto')
-		ax_colorbar.set_box_aspect(50) 
+		shap.plots.beeswarm(self.explanation, show=False, axis_color='black', max_display=None, **kwargs)  # pass show=False to get gcf/gca
 		
 		return plt.gcf(), plt.gca()
--- a/yli/utils.py
+++ b/yli/utils.py
@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2025  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@ -362,14 +362,10 @@ def cols_for_formula(formula, df):
 			name = factor.name()
 			if name.startswith('C('):
 				# Contrasts expression
-				# Get the corresponding factor_info
-				factor_info = formula_get_factor_info(formula, df, name)
-				
-				# Evaluate the factor
-				categorical_box = factor_info.factor.eval(factor_info.state, df)
-				
-				# Get the column name
-				name = categorical_box.data.name
+				if ',' in name:
+					name = name[2:name.index(',')]
+				else:
+					name = name[2:name.index(')')]
 			
 			cols.add(name)
Author	SHA1	Message	Date
RunasSudo	ed65dc3e8f	Use shap "new" Explanation API	2025-01-29 00:59:25 +11:00
RunasSudo	57e472ca09	Re-implement RegressionModel.bootstrap	2025-01-28 19:31:40 +11:00
RunasSudo	a09a84f9ef	Remove cols_for_formula dependency on patsy	2025-01-28 18:41:40 +11:00
RunasSudo	1cc9aae3bd	Minor fixups	2025-01-28 18:40:07 +11:00