Compare commits

...

4 Commits

Author SHA1 Message Date
ed65dc3e8f
Use shap "new" Explanation API 2025-01-29 00:59:25 +11:00
57e472ca09
Re-implement RegressionModel.bootstrap 2025-01-28 19:31:40 +11:00
a09a84f9ef
Remove cols_for_formula dependency on patsy 2025-01-28 18:41:40 +11:00
1cc9aae3bd
Minor fixups 2025-01-28 18:40:07 +11:00
5 changed files with 87 additions and 29 deletions

View File

@ -53,6 +53,7 @@ Optional dependencies are:
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionModel.bayesfactor_beta_zero*)
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
* [shap](https://shap.readthedocs.io/en/latest/), for *RegressionModel.shap*
* [tqdm](https://tqdm.github.io/), for *RegressionModel.bootstrap*
## Functions

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -285,7 +285,7 @@ BETA_ODDSRATIO_CDF_TERMS = [100]
beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0) # a = lower bound of support
class transformed_gen(stats.rv_continuous):
"""
r"""
Represents a transformation, *Y*, of a "base" random variable, *X*
This is a SciPy *stats.rv_continuous* distribution which takes parameters as described below.

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -38,7 +38,6 @@ from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
# TODO: Documentation
# TODO: Bootstrap
def vif(df, formula=None, *, nan_policy='warn'):
"""
@ -193,6 +192,7 @@ def regress(
result.dep = dep
result.formula = formula
result.nan_policy = nan_policy
result.fit_kwargs = fit_kwargs
if exp is not None:
result.exp = exp
result.fitted_dt = datetime.now()
@ -275,6 +275,7 @@ class RegressionModel:
self.dep = None
self.formula = None
self.nan_policy = None
self.fit_kwargs = None
self.exp = False
self.cov_type = None
@ -494,7 +495,7 @@ class RegressionModel:
return out
def terms_flat(self):
"""
r"""
Iterate over each :class:`SingleTerm` in *self.terms*, recursively stepping through :class:`CategoricalTerm`\ s
"""
@ -585,6 +586,57 @@ class RegressionModel:
return 1 - (self.ll_model - self.dof_model) / self.ll_null
def bootstrap(self, samples=1000):
"""
Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model
Mutates the current RegressionModel instance.
:param samples: Number of bootstrap samples to draw
:type samples: int
:rtype: :class:`yli.regress.RegressionModel`
"""
from tqdm import tqdm
df = self.df()
if df is None:
raise Exception('Referenced DataFrame has been dropped')
# Preprocess data, check for NaN and get design matrices
df_clean, dmatrices, dep_categories = df_to_dmatrices(df, self.dep, self.formula, self.nan_policy, [])
# Initialise bootstrap_results
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
for term in self.terms_flat():
bootstrap_results[term.raw_name] = []
# Draw bootstrap samples and regress
dmatrices = dmatrices[0].join(dmatrices[1])
for i in tqdm(range(samples)):
bootstrap_rows = dmatrices.sample(len(df), replace=True)
# Fit model
result = self.__class__.fit(bootstrap_rows.iloc[:,0], bootstrap_rows.iloc[:,1:], **self.fit_kwargs)
for term in result.terms_flat():
bootstrap_results[term.raw_name].append(term.beta.point)
# Combine bootstrap results
for term in self.terms_flat():
bootstrap_betas = bootstrap_results[term.raw_name]
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
term.beta = Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2))
term.pvalue = bootstrap_pvalue
self.cov_type = 'Bootstrap'
return self
def shap(self, **kwargs):
"""
Compute SHAP values for the model
@ -611,9 +663,9 @@ class RegressionModel:
params.extend(s.beta.point for s in term.categories.values())
explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept
shap_values = explainer.shap_values(xdata).astype('float')
explanation = explainer(xdata)
return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
return ShapResult(weakref.ref(self), explanation)
class LikelihoodRatioTestResult(ChiSquaredResult):
"""

View File

@ -1,3 +1,19 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
import patsy
@ -10,10 +26,9 @@ class ShapResult:
See :meth:`yli.regress.RegressionModel.shap`.
"""
def __init__(self, model, shap_values, features):
def __init__(self, model, explanation):
self.model = model
self.shap_values = shap_values
self.features = features
self.explanation = explanation
@staticmethod
def _get_xdata(model):
@ -29,6 +44,7 @@ class ShapResult:
df = check_nan(df, 'omit')
# Ensure numeric type for dependent variable
# TODO: Is this step necessary?
df[dep], dep_categories = as_numeric(df[dep])
# Convert pandas nullable types for independent variables as this breaks statsmodels
@ -47,7 +63,8 @@ class ShapResult:
:rtype: Series
"""
return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
raise NotImplementedError()
#return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
def plot(self, **kwargs):
"""
@ -65,14 +82,6 @@ class ShapResult:
if model is None:
raise Exception('Referenced RegressionModel has been dropped')
xdata = self._get_xdata(model)
shap.summary_plot(self.shap_values, xdata, show=False, axis_color='black', **kwargs) # pass show=False to get gcf/gca
# Fix colour bar
# https://stackoverflow.com/questions/70461753/shap-the-color-bar-is-not-displayed-in-the-summary-plot
ax_colorbar = plt.gcf().axes[-1]
ax_colorbar.set_aspect('auto')
ax_colorbar.set_box_aspect(50)
shap.plots.beeswarm(self.explanation, show=False, axis_color='black', max_display=None, **kwargs) # pass show=False to get gcf/gca
return plt.gcf(), plt.gca()

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -362,14 +362,10 @@ def cols_for_formula(formula, df):
name = factor.name()
if name.startswith('C('):
# Contrasts expression
# Get the corresponding factor_info
factor_info = formula_get_factor_info(formula, df, name)
# Evaluate the factor
categorical_box = factor_info.factor.eval(factor_info.state, df)
# Get the column name
name = categorical_box.data.name
if ',' in name:
name = name[2:name.index(',')]
else:
name = name[2:name.index(')')]
cols.add(name)