Rename bootstrap_regress to regress_bootstrap and update
Improve performance by directly calling statsmodels regression Report bootstrap in model summary Add to documentation
This commit is contained in:
parent
e71a1aea12
commit
645cb7a85e
@ -10,6 +10,8 @@ Functions
|
|||||||
|
|
||||||
.. autofunction:: yli.regress
|
.. autofunction:: yli.regress
|
||||||
|
|
||||||
|
.. autofunction:: yli.regress_bootstrap
|
||||||
|
|
||||||
.. autofunction:: yli.vif
|
.. autofunction:: yli.vif
|
||||||
|
|
||||||
Result classes
|
Result classes
|
||||||
|
@ -15,12 +15,11 @@
|
|||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from .bayes_factors import bayesfactor_afbf
|
from .bayes_factors import bayesfactor_afbf
|
||||||
from .bootstrap import bootstrap_regress
|
|
||||||
from .config import config
|
from .config import config
|
||||||
from .descriptives import auto_descriptives
|
from .descriptives import auto_descriptives
|
||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import PenalisedLogit, logit_then_regress, regress, vif
|
from .regress import PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
|
||||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
||||||
|
|
||||||
def reload_me():
|
def reload_me():
|
||||||
|
103
yli/bootstrap.py
103
yli/bootstrap.py
@ -1,103 +0,0 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
|
||||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
|
||||||
#
|
|
||||||
# This program is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
|
||||||
# the Free Software Foundation, either version 3 of the License, or
|
|
||||||
# (at your option) any later version.
|
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU Affero General Public License for more details.
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU Affero General Public License
|
|
||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import patsy
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
|
|
||||||
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term
|
|
||||||
|
|
||||||
def bootstrap_regress(
|
|
||||||
model_class, df, dep, formula, *,
|
|
||||||
nan_policy='warn',
|
|
||||||
samples=1000,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
|
|
||||||
|
|
||||||
:param model_class: See :func:`yli.regress`
|
|
||||||
:param df: See :func:`yli.regress`
|
|
||||||
:param dep: See :func:`yli.regress`
|
|
||||||
:param formula: See :func:`yli.regress`
|
|
||||||
:param nan_policy: See :func:`yli.regress`
|
|
||||||
:param samples: Number of bootstrap samples to draw
|
|
||||||
:type samples: int
|
|
||||||
:param kwargs: See :func:`yli.regress`
|
|
||||||
|
|
||||||
:rtype: :class:`yli.regress.RegressionResult`
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Check for/clean NaNs
|
|
||||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
|
||||||
df = df[[dep] + cols_for_formula(formula, df)]
|
|
||||||
df = check_nan(df, nan_policy)
|
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
|
||||||
if df[dep].dtype != 'float64':
|
|
||||||
df[dep] = df[dep].astype('float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
|
||||||
df = convert_pandas_nullable(df)
|
|
||||||
|
|
||||||
# Precompute design matrices
|
|
||||||
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
|
||||||
|
|
||||||
# Fit full model
|
|
||||||
full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)
|
|
||||||
|
|
||||||
# Cache reference categories
|
|
||||||
ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}
|
|
||||||
|
|
||||||
# Draw bootstrap samples and regress
|
|
||||||
bootstrap_results = []
|
|
||||||
for i in tqdm(range(samples)):
|
|
||||||
#bootstrap_sample = df.sample(len(df), replace=True)
|
|
||||||
#bootstrap_results.append(regress(model_class, bootstrap_sample, dep, formula, nan_policy='raise', _dmatrices=dmatrices, _ref_categories=ref_categories, **kwargs))
|
|
||||||
bootstrap_rows = pd.Series(dmatrices[0].index).sample(len(df), replace=True)
|
|
||||||
bootstrap_dmatrices = (dmatrices[0].loc[bootstrap_rows], dmatrices[1].loc[bootstrap_rows])
|
|
||||||
bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))
|
|
||||||
|
|
||||||
# Combine bootstrap results
|
|
||||||
terms = {}
|
|
||||||
for term_name, term in full_model.terms.items():
|
|
||||||
if isinstance(term, SingleTerm):
|
|
||||||
bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
|
|
||||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
|
||||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
|
||||||
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
|
|
||||||
else:
|
|
||||||
categories = {}
|
|
||||||
for sub_term_name, sub_term in term.categories.items():
|
|
||||||
bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
|
|
||||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
|
||||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
|
||||||
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, 0.025), np.quantile(bootstrap_betas, 0.975)), bootstrap_pvalue)
|
|
||||||
terms[term_name] = CategoricalTerm(categories, term.ref_category)
|
|
||||||
|
|
||||||
return RegressionResult(
|
|
||||||
None,
|
|
||||||
full_model.full_name, full_model.model_name, full_model.fit_method,
|
|
||||||
dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
|
|
||||||
terms,
|
|
||||||
full_model.llf, full_model.llnull,
|
|
||||||
full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
|
|
||||||
full_model.exp
|
|
||||||
)
|
|
106
yli/regress.py
106
yli/regress.py
@ -22,6 +22,7 @@ import statsmodels
|
|||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
from statsmodels.iolib.table import SimpleTable
|
from statsmodels.iolib.table import SimpleTable
|
||||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import itertools
|
import itertools
|
||||||
@ -254,7 +255,7 @@ class RegressionResult:
|
|||||||
left_col.append(('Method:', self.fit_method))
|
left_col.append(('Method:', self.fit_method))
|
||||||
left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
|
left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d')))
|
||||||
left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
|
left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S')))
|
||||||
left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper()))
|
left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper() if self.cov_type.startswith('hc') else self.cov_type))
|
||||||
|
|
||||||
# Right column
|
# Right column
|
||||||
right_col = []
|
right_col = []
|
||||||
@ -440,7 +441,7 @@ def regress(
|
|||||||
family=None, # common model_kwargs
|
family=None, # common model_kwargs
|
||||||
cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
|
cov_type=None, maxiter=None, start_params=None, # common fit_kwargs
|
||||||
bool_baselevels=False, exp=None,
|
bool_baselevels=False, exp=None,
|
||||||
_dmatrices=None, _ref_categories=None,
|
_dmatrices=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Fit a statsmodels regression model
|
Fit a statsmodels regression model
|
||||||
@ -589,10 +590,7 @@ def regress(
|
|||||||
else:
|
else:
|
||||||
# Add a new categorical term if not exists
|
# Add a new categorical term if not exists
|
||||||
if column not in terms:
|
if column not in terms:
|
||||||
if _ref_categories is None:
|
ref_category = formula_factor_ref_category(formula, df, factor)
|
||||||
ref_category = formula_factor_ref_category(formula, df, factor)
|
|
||||||
else:
|
|
||||||
ref_category = _ref_categories[column]
|
|
||||||
terms[column] = CategoricalTerm({}, ref_category)
|
terms[column] = CategoricalTerm({}, ref_category)
|
||||||
|
|
||||||
terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
terms[column].categories[contrast] = SingleTerm(raw_name, beta, pvalues[raw_name])
|
||||||
@ -640,6 +638,102 @@ def regress(
|
|||||||
exp
|
exp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def regress_bootstrap(
|
||||||
|
model_class, df, dep, formula, *,
|
||||||
|
nan_policy='warn',
|
||||||
|
model_kwargs=None, fit_kwargs=None,
|
||||||
|
samples=1000,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
|
||||||
|
|
||||||
|
:param model_class: See :func:`yli.regress`
|
||||||
|
:param df: See :func:`yli.regress`
|
||||||
|
:param dep: See :func:`yli.regress`
|
||||||
|
:param formula: See :func:`yli.regress`
|
||||||
|
:param nan_policy: See :func:`yli.regress`
|
||||||
|
:param model_kwargs: See :func:`yli.regress`
|
||||||
|
:param fit_kwargs: See :func:`yli.regress`
|
||||||
|
:param samples: Number of bootstrap samples to draw
|
||||||
|
:type samples: int
|
||||||
|
:param kwargs: Other arguments to pass to :func:`yli.regress`
|
||||||
|
|
||||||
|
:rtype: :class:`yli.regress.RegressionResult`
|
||||||
|
"""
|
||||||
|
|
||||||
|
if model_kwargs is None:
|
||||||
|
model_kwargs = {}
|
||||||
|
if fit_kwargs is None:
|
||||||
|
fit_kwargs = {}
|
||||||
|
|
||||||
|
# Check for/clean NaNs
|
||||||
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
|
df = df[[dep] + cols_for_formula(formula, df)]
|
||||||
|
df = check_nan(df, nan_policy)
|
||||||
|
|
||||||
|
# Ensure numeric type for dependent variable
|
||||||
|
if df[dep].dtype != 'float64':
|
||||||
|
df[dep] = df[dep].astype('float64')
|
||||||
|
|
||||||
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
|
df = convert_pandas_nullable(df)
|
||||||
|
|
||||||
|
# Precompute design matrices
|
||||||
|
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
||||||
|
|
||||||
|
# Fit full model
|
||||||
|
full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs)
|
||||||
|
|
||||||
|
# Initialise bootstrap_results
|
||||||
|
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
|
||||||
|
for term in full_model.terms.values():
|
||||||
|
if isinstance(term, SingleTerm):
|
||||||
|
bootstrap_results[term.raw_name] = []
|
||||||
|
else:
|
||||||
|
for sub_term in term.categories.values():
|
||||||
|
bootstrap_results[sub_term.raw_name] = []
|
||||||
|
|
||||||
|
# Draw bootstrap samples and regress
|
||||||
|
dmatrices = dmatrices[0].join(dmatrices[1])
|
||||||
|
|
||||||
|
for i in tqdm(range(samples)):
|
||||||
|
bootstrap_rows = dmatrices.sample(len(df), replace=True)
|
||||||
|
model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs)
|
||||||
|
model.formula = dep + ' ~ ' + formula
|
||||||
|
|
||||||
|
result = model.fit(disp=False, **fit_kwargs)
|
||||||
|
|
||||||
|
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
|
||||||
|
bootstrap_results[raw_name].append(raw_beta)
|
||||||
|
|
||||||
|
# Combine bootstrap results
|
||||||
|
terms = {}
|
||||||
|
for term_name, term in full_model.terms.items():
|
||||||
|
if isinstance(term, SingleTerm):
|
||||||
|
bootstrap_betas = bootstrap_results[term.raw_name]
|
||||||
|
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||||
|
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||||
|
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
||||||
|
else:
|
||||||
|
categories = {}
|
||||||
|
for sub_term_name, sub_term in term.categories.items():
|
||||||
|
bootstrap_betas = bootstrap_results[sub_term.raw_name]
|
||||||
|
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||||
|
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||||
|
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
||||||
|
terms[term_name] = CategoricalTerm(categories, term.ref_category)
|
||||||
|
|
||||||
|
return RegressionResult(
|
||||||
|
None,
|
||||||
|
full_model.full_name, full_model.model_name, full_model.fit_method,
|
||||||
|
dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
|
||||||
|
terms,
|
||||||
|
full_model.llf, full_model.llnull,
|
||||||
|
full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
|
||||||
|
full_model.exp
|
||||||
|
)
|
||||||
|
|
||||||
def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
|
def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
|
||||||
"""
|
"""
|
||||||
Perform logistic regression, then use parameters as start parameters for desired regression
|
Perform logistic regression, then use parameters as start parameters for desired regression
|
||||||
|
Loading…
Reference in New Issue
Block a user