Compare commits

..

No commits in common. "ed65dc3e8f95cf22b73198a7896038e06550849b" and "b7a66849ff370a70e3cbd13267dc2f670dc27c4f" have entirely different histories.

5 changed files with 29 additions and 87 deletions

View File

@ -53,7 +53,6 @@ Optional dependencies are:
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionModel.bayesfactor_beta_zero*)
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
* [shap](https://shap.readthedocs.io/en/latest/), for *RegressionModel.shap*
* [tqdm](https://tqdm.github.io/), for *RegressionModel.bootstrap*
## Functions

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -285,7 +285,7 @@ BETA_ODDSRATIO_CDF_TERMS = [100]
beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0) # a = lower bound of support
class transformed_gen(stats.rv_continuous):
r"""
"""
Represents a transformation, *Y*, of a "base" random variable, *X*
This is a SciPy *stats.rv_continuous* distribution which takes parameters as described below.

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -38,6 +38,7 @@ from .sig_tests import ChiSquaredResult, FTestResult
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
# TODO: Documentation
# TODO: Bootstrap
def vif(df, formula=None, *, nan_policy='warn'):
"""
@ -192,7 +193,6 @@ def regress(
result.dep = dep
result.formula = formula
result.nan_policy = nan_policy
result.fit_kwargs = fit_kwargs
if exp is not None:
result.exp = exp
result.fitted_dt = datetime.now()
@ -275,7 +275,6 @@ class RegressionModel:
self.dep = None
self.formula = None
self.nan_policy = None
self.fit_kwargs = None
self.exp = False
self.cov_type = None
@ -495,7 +494,7 @@ class RegressionModel:
return out
def terms_flat(self):
r"""
"""
Iterate over each :class:`SingleTerm` in *self.terms*, recursively stepping through :class:`CategoricalTerm`\ s
"""
@ -586,57 +585,6 @@ class RegressionModel:
return 1 - (self.ll_model - self.dof_model) / self.ll_null
def bootstrap(self, samples=1000):
"""
Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model
Mutates the current RegressionModel instance.
:param samples: Number of bootstrap samples to draw
:type samples: int
:rtype: :class:`yli.regress.RegressionModel`
"""
from tqdm import tqdm
df = self.df()
if df is None:
raise Exception('Referenced DataFrame has been dropped')
# Preprocess data, check for NaN and get design matrices
df_clean, dmatrices, dep_categories = df_to_dmatrices(df, self.dep, self.formula, self.nan_policy, [])
# Initialise bootstrap_results
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
for term in self.terms_flat():
bootstrap_results[term.raw_name] = []
# Draw bootstrap samples and regress
dmatrices = dmatrices[0].join(dmatrices[1])
for i in tqdm(range(samples)):
bootstrap_rows = dmatrices.sample(len(df), replace=True)
# Fit model
result = self.__class__.fit(bootstrap_rows.iloc[:,0], bootstrap_rows.iloc[:,1:], **self.fit_kwargs)
for term in result.terms_flat():
bootstrap_results[term.raw_name].append(term.beta.point)
# Combine bootstrap results
for term in self.terms_flat():
bootstrap_betas = bootstrap_results[term.raw_name]
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
term.beta = Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2))
term.pvalue = bootstrap_pvalue
self.cov_type = 'Bootstrap'
return self
def shap(self, **kwargs):
"""
Compute SHAP values for the model
@ -663,9 +611,9 @@ class RegressionModel:
params.extend(s.beta.point for s in term.categories.values())
explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept
explanation = explainer(xdata)
shap_values = explainer.shap_values(xdata).astype('float')
return ShapResult(weakref.ref(self), explanation)
return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
class LikelihoodRatioTestResult(ChiSquaredResult):
"""

View File

@ -1,19 +1,3 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
import patsy
@ -26,9 +10,10 @@ class ShapResult:
See :meth:`yli.regress.RegressionModel.shap`.
"""
def __init__(self, model, explanation):
def __init__(self, model, shap_values, features):
self.model = model
self.explanation = explanation
self.shap_values = shap_values
self.features = features
@staticmethod
def _get_xdata(model):
@ -44,7 +29,6 @@ class ShapResult:
df = check_nan(df, 'omit')
# Ensure numeric type for dependent variable
# TODO: Is this step necessary?
df[dep], dep_categories = as_numeric(df[dep])
# Convert pandas nullable types for independent variables as this breaks statsmodels
@ -63,8 +47,7 @@ class ShapResult:
:rtype: Series
"""
raise NotImplementedError()
#return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
def plot(self, **kwargs):
"""
@ -82,6 +65,14 @@ class ShapResult:
if model is None:
raise Exception('Referenced RegressionModel has been dropped')
shap.plots.beeswarm(self.explanation, show=False, axis_color='black', max_display=None, **kwargs) # pass show=False to get gcf/gca
xdata = self._get_xdata(model)
shap.summary_plot(self.shap_values, xdata, show=False, axis_color='black', **kwargs) # pass show=False to get gcf/gca
# Fix colour bar
# https://stackoverflow.com/questions/70461753/shap-the-color-bar-is-not-displayed-in-the-summary-plot
ax_colorbar = plt.gcf().axes[-1]
ax_colorbar.set_aspect('auto')
ax_colorbar.set_box_aspect(50)
return plt.gcf(), plt.gca()

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -362,10 +362,14 @@ def cols_for_formula(formula, df):
name = factor.name()
if name.startswith('C('):
# Contrasts expression
if ',' in name:
name = name[2:name.index(',')]
else:
name = name[2:name.index(')')]
# Get the corresponding factor_info
factor_info = formula_get_factor_info(formula, df, name)
# Evaluate the factor
categorical_box = factor_info.factor.eval(factor_info.state, df)
# Get the column name
name = categorical_box.data.name
cols.add(name)