Compare commits
4 Commits
b7a66849ff
...
ed65dc3e8f
Author | SHA1 | Date | |
---|---|---|---|
ed65dc3e8f | |||
57e472ca09 | |||
a09a84f9ef | |||
1cc9aae3bd |
@ -53,6 +53,7 @@ Optional dependencies are:
|
||||
* [BFpack](https://cran.r-project.org/web/packages/BFpack/index.html), for *bayesfactor_afbf* (*RegressionModel.bayesfactor_beta_zero*)
|
||||
* [logistf](https://cran.r-project.org/web/packages/logistf/index.html), for *PenalisedLogit*
|
||||
* [shap](https://shap.readthedocs.io/en/latest/), for *RegressionModel.shap*
|
||||
* [tqdm](https://tqdm.github.io/), for *RegressionModel.bootstrap*
|
||||
|
||||
## Functions
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -285,7 +285,7 @@ BETA_ODDSRATIO_CDF_TERMS = [100]
|
||||
beta_oddsratio = betaoddsrat_gen(name='beta_oddsratio', a=0) # a = lower bound of support
|
||||
|
||||
class transformed_gen(stats.rv_continuous):
|
||||
"""
|
||||
r"""
|
||||
Represents a transformation, *Y*, of a "base" random variable, *X*
|
||||
|
||||
This is a SciPy *stats.rv_continuous* distribution which takes parameters as described below.
|
||||
|
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -38,7 +38,6 @@ from .sig_tests import ChiSquaredResult, FTestResult
|
||||
from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
||||
|
||||
# TODO: Documentation
|
||||
# TODO: Bootstrap
|
||||
|
||||
def vif(df, formula=None, *, nan_policy='warn'):
|
||||
"""
|
||||
@ -193,6 +192,7 @@ def regress(
|
||||
result.dep = dep
|
||||
result.formula = formula
|
||||
result.nan_policy = nan_policy
|
||||
result.fit_kwargs = fit_kwargs
|
||||
if exp is not None:
|
||||
result.exp = exp
|
||||
result.fitted_dt = datetime.now()
|
||||
@ -275,6 +275,7 @@ class RegressionModel:
|
||||
self.dep = None
|
||||
self.formula = None
|
||||
self.nan_policy = None
|
||||
self.fit_kwargs = None
|
||||
self.exp = False
|
||||
self.cov_type = None
|
||||
|
||||
@ -494,7 +495,7 @@ class RegressionModel:
|
||||
return out
|
||||
|
||||
def terms_flat(self):
|
||||
"""
|
||||
r"""
|
||||
Iterate over each :class:`SingleTerm` in *self.terms*, recursively stepping through :class:`CategoricalTerm`\ s
|
||||
"""
|
||||
|
||||
@ -585,6 +586,57 @@ class RegressionModel:
|
||||
|
||||
return 1 - (self.ll_model - self.dof_model) / self.ll_null
|
||||
|
||||
def bootstrap(self, samples=1000):
|
||||
"""
|
||||
Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model
|
||||
|
||||
Mutates the current RegressionModel instance.
|
||||
|
||||
:param samples: Number of bootstrap samples to draw
|
||||
:type samples: int
|
||||
|
||||
:rtype: :class:`yli.regress.RegressionModel`
|
||||
"""
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
df = self.df()
|
||||
if df is None:
|
||||
raise Exception('Referenced DataFrame has been dropped')
|
||||
|
||||
# Preprocess data, check for NaN and get design matrices
|
||||
df_clean, dmatrices, dep_categories = df_to_dmatrices(df, self.dep, self.formula, self.nan_policy, [])
|
||||
|
||||
# Initialise bootstrap_results
|
||||
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
|
||||
for term in self.terms_flat():
|
||||
bootstrap_results[term.raw_name] = []
|
||||
|
||||
# Draw bootstrap samples and regress
|
||||
dmatrices = dmatrices[0].join(dmatrices[1])
|
||||
|
||||
for i in tqdm(range(samples)):
|
||||
bootstrap_rows = dmatrices.sample(len(df), replace=True)
|
||||
|
||||
# Fit model
|
||||
result = self.__class__.fit(bootstrap_rows.iloc[:,0], bootstrap_rows.iloc[:,1:], **self.fit_kwargs)
|
||||
|
||||
for term in result.terms_flat():
|
||||
bootstrap_results[term.raw_name].append(term.beta.point)
|
||||
|
||||
# Combine bootstrap results
|
||||
for term in self.terms_flat():
|
||||
bootstrap_betas = bootstrap_results[term.raw_name]
|
||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||
|
||||
term.beta = Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2))
|
||||
term.pvalue = bootstrap_pvalue
|
||||
|
||||
self.cov_type = 'Bootstrap'
|
||||
|
||||
return self
|
||||
|
||||
def shap(self, **kwargs):
|
||||
"""
|
||||
Compute SHAP values for the model
|
||||
@ -611,9 +663,9 @@ class RegressionModel:
|
||||
params.extend(s.beta.point for s in term.categories.values())
|
||||
|
||||
explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept
|
||||
shap_values = explainer.shap_values(xdata).astype('float')
|
||||
explanation = explainer(xdata)
|
||||
|
||||
return ShapResult(weakref.ref(self), shap_values, list(xdata.columns))
|
||||
return ShapResult(weakref.ref(self), explanation)
|
||||
|
||||
class LikelihoodRatioTestResult(ChiSquaredResult):
|
||||
"""
|
||||
|
35
yli/shap.py
35
yli/shap.py
@ -1,3 +1,19 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import pandas as pd
|
||||
import patsy
|
||||
|
||||
@ -10,10 +26,9 @@ class ShapResult:
|
||||
See :meth:`yli.regress.RegressionModel.shap`.
|
||||
"""
|
||||
|
||||
def __init__(self, model, shap_values, features):
|
||||
def __init__(self, model, explanation):
|
||||
self.model = model
|
||||
self.shap_values = shap_values
|
||||
self.features = features
|
||||
self.explanation = explanation
|
||||
|
||||
@staticmethod
|
||||
def _get_xdata(model):
|
||||
@ -29,6 +44,7 @@ class ShapResult:
|
||||
df = check_nan(df, 'omit')
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
# TODO: Is this step necessary?
|
||||
df[dep], dep_categories = as_numeric(df[dep])
|
||||
|
||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||
@ -47,7 +63,8 @@ class ShapResult:
|
||||
:rtype: Series
|
||||
"""
|
||||
|
||||
return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
|
||||
raise NotImplementedError()
|
||||
#return pd.Series(abs(self.shap_values).mean(axis=0), index=self.features)
|
||||
|
||||
def plot(self, **kwargs):
|
||||
"""
|
||||
@ -65,14 +82,6 @@ class ShapResult:
|
||||
if model is None:
|
||||
raise Exception('Referenced RegressionModel has been dropped')
|
||||
|
||||
xdata = self._get_xdata(model)
|
||||
|
||||
shap.summary_plot(self.shap_values, xdata, show=False, axis_color='black', **kwargs) # pass show=False to get gcf/gca
|
||||
|
||||
# Fix colour bar
|
||||
# https://stackoverflow.com/questions/70461753/shap-the-color-bar-is-not-displayed-in-the-summary-plot
|
||||
ax_colorbar = plt.gcf().axes[-1]
|
||||
ax_colorbar.set_aspect('auto')
|
||||
ax_colorbar.set_box_aspect(50)
|
||||
shap.plots.beeswarm(self.explanation, show=False, axis_color='black', max_display=None, **kwargs) # pass show=False to get gcf/gca
|
||||
|
||||
return plt.gcf(), plt.gca()
|
||||
|
14
yli/utils.py
14
yli/utils.py
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2025 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -362,14 +362,10 @@ def cols_for_formula(formula, df):
|
||||
name = factor.name()
|
||||
if name.startswith('C('):
|
||||
# Contrasts expression
|
||||
# Get the corresponding factor_info
|
||||
factor_info = formula_get_factor_info(formula, df, name)
|
||||
|
||||
# Evaluate the factor
|
||||
categorical_box = factor_info.factor.eval(factor_info.state, df)
|
||||
|
||||
# Get the column name
|
||||
name = categorical_box.data.name
|
||||
if ',' in name:
|
||||
name = name[2:name.index(',')]
|
||||
else:
|
||||
name = name[2:name.index(')')]
|
||||
|
||||
cols.add(name)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user