Implement utilities for SHAP values in regression

2023-02-07 18:50:07 +11:00 · 2023-02-07 18:50:07 +11:00 · 967b853b02
commit 967b853b02
parent dbebc3b8e9
2 changed files with 81 additions and 0 deletions
--- a/yli/regress.py
+++ b/yli/regress.py
@ -32,6 +32,7 @@ import weakref
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .config import config
 from .shap import ShapResult
 from .sig_tests import ChiSquaredResult, FTestResult
 from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
@ -460,6 +461,26 @@ class RegressionResult:
 			self.exp
 		)
 	def shap(self, **kwargs):
 		"""
 		Compute SHAP values for this regression model using *shap.LinearExplainer*

 		The explainer is built from the point estimates of the model terms and is evaluated on the same design data it was given as background data.

 		:param kwargs: Additional keyword arguments passed through to *shap.LinearExplainer*
 		:return: A :class:`ShapResult` holding a weak reference to this model, the SHAP values (cast to float) and the column names of the design data
 		"""
 		# Imported lazily so that the shap package is only required when this method is called
 		import shap
 		xdata = ShapResult._get_xdata(self)
 		# Flatten the point estimates of every term (one entry per category for non-single terms)
 		coefficients = []
 		for term in self.terms.values():
 			if not isinstance(term, SingleTerm):
 				for category in term.categories.values():
 					coefficients.append(category.beta.point)
 				continue
 			coefficients.append(term.beta.point)
 		# FIXME: Assumes zeroth term is intercept
 		intercept = coefficients[0]
 		slopes = np.array(coefficients[1:])
 		explainer = shap.LinearExplainer((slopes, intercept), xdata, **kwargs)
 		values = explainer.shap_values(xdata).astype('float')
 		# Hold only a weak reference to the model in the result
 		return ShapResult(weakref.ref(self), values, list(xdata.columns))
 	def _header_table(self, html):
 		"""Return the entries for the header table"""
--- a/yli/shap.py
+++ b/yli/shap.py
@ -0,0 +1,60 @@
 import pandas as pd
 import patsy
 from .utils import as_numeric, check_nan, cols_for_formula, convert_pandas_nullable
 class ShapResult:
 	"""
 	SHAP values computed for a regression model, with helpers to summarise and plot them
 	"""
 	def __init__(self, model, shap_values, features):
 		"""
 		Store the SHAP values together with a reference to the originating model

 		:param model: Callable (e.g. a *weakref.ref*) returning the regression result, or *None* once that result has been dropped
 		:param shap_values: Array of SHAP values; presumably one row per observation and one column per feature (TODO confirm)
 		:param features: Labels for the columns of *shap_values*, used as the index of :meth:`mean`
 		"""
 		self.features = features
 		self.shap_values = shap_values
 		self.model = model
 	@staticmethod
 	def _get_xdata(model):
 		"""
 		Rebuild the design data (without its zeroth column) for the given regression model

 		:param model: Regression result providing *df()*, *dep* and *formula*
 		:return: DataFrame produced by *patsy.dmatrix*, excluding the zeroth column
 		:raises Exception: If the DataFrame referenced by the model has been dropped
 		"""
 		frame = model.df()
 		if frame is None:
 			raise Exception('Referenced DataFrame has been dropped')
 		outcome = model.dep
 		# Restrict to the columns in use, then discard rows containing NaN
 		# Per the original author's note, any NaN warning/error is handled when the model is fitted, hence nan_policy 'omit' here
 		wanted = [outcome] + cols_for_formula(model.formula, frame)
 		frame = check_nan(frame[wanted], 'omit')
 		# Ensure numeric type for dependent variable (the second return value is not needed here)
 		frame[outcome], _ = as_numeric(frame[outcome])
 		# Convert pandas nullable types for independent variables as this breaks statsmodels
 		frame = convert_pandas_nullable(frame)
 		# Build the design matrix and drop its zeroth column
 		design = patsy.dmatrix(model.formula, frame, return_type='dataframe')
 		return design.iloc[:, 1:]  # FIXME: Assumes zeroth term is intercept
 	def mean(self):
 		"""
 		Compute the mean absolute SHAP value along axis 0

 		:return: Series of mean absolute SHAP values, indexed by *features*
 		"""
 		magnitudes = abs(self.shap_values)
 		return pd.Series(magnitudes.mean(axis=0), index=self.features)
 	def plot(self, **kwargs):
 		"""
 		Draw a SHAP summary plot of the stored values

 		:param kwargs: Additional keyword arguments passed through to *shap.summary_plot*
 		:return: Tuple of the current matplotlib figure and axes
 		:raises Exception: If the referenced regression result (or its DataFrame) has been dropped
 		"""
 		import matplotlib.pyplot as plt
 		import shap
 		regression = self.model()
 		if regression is None:
 			raise Exception('Referenced RegressionResult has been dropped')
 		design = self._get_xdata(regression)
 		# show=False keeps the figure open so that gcf/gca can be adjusted and returned
 		shap.summary_plot(self.shap_values, design, show=False, axis_color='black', **kwargs)
 		# Fix colour bar (taken to be the last axes of the figure)
 		# https://stackoverflow.com/questions/70461753/shap-the-color-bar-is-not-displayed-in-the-summary-plot
 		colorbar_axes = plt.gcf().axes[-1]
 		colorbar_axes.set_aspect('auto')
 		colorbar_axes.set_box_aspect(50)
 		return plt.gcf(), plt.gca()