# scipy-yli/yli/bootstrap.py

#   scipy-yli: Helpful SciPy utilities and recipes
#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import numpy as np
import pandas as pd
import patsy
from tqdm import tqdm

from datetime import datetime

from .regress import CategoricalTerm, RegressionResult, SingleTerm, regress
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term

def _bootstrap_term(raw_name, point, bootstrap_betas):
	"""
	Summarise the bootstrap distribution of one coefficient as a :class:`yli.regress.SingleTerm`

	:param raw_name: Raw name of the term, passed through unchanged to the resulting term
	:param point: Point estimate of the coefficient from the full (non-resampled) model
	:param bootstrap_betas: Point estimates of the coefficient, one per bootstrap sample (must be non-empty)
	:type bootstrap_betas: list

	:return: Term combining *point* with the 2.5%–97.5% percentile interval of *bootstrap_betas* and a two-sided bootstrap *p* value
	:rtype: :class:`yli.regress.SingleTerm`
	"""

	# Two-sided p value: twice the smaller of the proportion of bootstrap estimates below zero and its complement
	proportion_negative = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
	pvalue = 2 * min(proportion_negative, 1 - proportion_negative)

	# Percentile confidence interval
	ci_lower, ci_upper = np.quantile(bootstrap_betas, [0.025, 0.975])

	return SingleTerm(raw_name, Estimate(point, ci_lower, ci_upper), pvalue)

def bootstrap_regress(
	model_class, df, dep, formula, *,
	nan_policy='warn',
	samples=1000,
	**kwargs
):
	"""
	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values

	The model is first fitted to the full data to obtain point estimates and fit statistics.
	The rows of the precomputed design matrices are then resampled with replacement *samples* times, and the model is refitted to each resample.
	For each coefficient, the confidence interval is the 2.5%–97.5% percentile interval of the bootstrap estimates, and the *p* value is twice the smaller of the proportion of bootstrap estimates below zero and its complement.

	Resampling uses NumPy's global random state, so results can be made reproducible with ``np.random.seed``.

	:param model_class: See :func:`yli.regress`
	:param df: See :func:`yli.regress`
	:param dep: See :func:`yli.regress`
	:param formula: See :func:`yli.regress`
	:param nan_policy: See :func:`yli.regress`
	:param samples: Number of bootstrap samples to draw (at least 1)
	:type samples: int
	:param kwargs: See :func:`yli.regress`

	:raises ValueError: If *samples* is less than 1

	:rtype: :class:`yli.regress.RegressionResult`
	"""

	# Fail early with a clear message, rather than dividing by zero when combining an empty set of bootstrap results
	if samples < 1:
		raise ValueError('samples must be at least 1')

	# Check for/clean NaNs
	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
	df = df[[dep] + cols_for_formula(formula, df)]
	df = check_nan(df, nan_policy)

	# Ensure numeric type for dependent variable
	# assign() builds a new DataFrame, so we never write into what pandas may regard as a slice of another frame
	if df[dep].dtype != 'float64':
		df = df.assign(**{dep: df[dep].astype('float64')})

	# Convert pandas nullable types for independent variables as this breaks statsmodels
	df = convert_pandas_nullable(df)

	# Precompute design matrices
	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
	n_rows = len(dmatrices[0])

	# Fit full model
	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, **kwargs)

	# Cache reference categories
	ref_categories = {term_name: term.ref_category for term_name, term in full_model.terms.items() if isinstance(term, CategoricalTerm)}

	# Draw bootstrap samples and regress
	bootstrap_results = []
	for _ in tqdm(range(samples)):
		# Sample row positions rather than index labels: a label-based lookup returns every matching row
		# when the index contains duplicate labels, which would distort the resample
		bootstrap_rows = np.random.choice(n_rows, size=len(df), replace=True)
		bootstrap_dmatrices = (dmatrices[0].iloc[bootstrap_rows], dmatrices[1].iloc[bootstrap_rows])
		bootstrap_results.append(regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=bootstrap_dmatrices, _ref_categories=ref_categories, **kwargs))

	# Combine bootstrap results
	terms = {}
	for term_name, term in full_model.terms.items():
		if isinstance(term, SingleTerm):
			bootstrap_betas = [r.terms[term_name].beta.point for r in bootstrap_results]
			terms[term_name] = _bootstrap_term(term.raw_name, term.beta.point, bootstrap_betas)
		else:
			# Any other term is treated as categorical: summarise each category separately
			categories = {}
			for sub_term_name, sub_term in term.categories.items():
				# Only use bootstrap fits in which this category was estimated
				bootstrap_betas = [r.terms[term_name].categories[sub_term_name].beta.point for r in bootstrap_results if sub_term_name in r.terms[term_name].categories]
				categories[sub_term_name] = _bootstrap_term(sub_term.raw_name, sub_term.beta.point, bootstrap_betas)
			terms[term_name] = CategoricalTerm(categories, term.ref_category)

	# Model-level statistics are taken from the full-model fit; only the terms carry bootstrap results
	# NOTE(review): the leading None is presumably the raw statsmodels result slot – confirm against RegressionResult
	return RegressionResult(
		None,
		full_model.full_name, full_model.model_name, full_model.fit_method,
		dep, full_model.nobs, full_model.dof_model, datetime.now(), full_model.cov_type,
		terms,
		full_model.llf, full_model.llnull,
		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
		full_model.exp
	)