Refactor bootstrap regression API as method of RegressionResult

2022-12-02 21:01:07 +11:00 · 2022-12-02 21:01:07 +11:00 · e4e85db354
commit e4e85db354
parent 4d59e1a521
2 changed files with 104 additions and 115 deletions
--- a/yli/init.py
+++ b/yli/init.py
@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind

 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
@ -125,7 +125,7 @@ class RegressionResult:
 	"""
 	
 	def __init__(self,
-		model_class, df, dep, formula, regress_kwargs,
+		model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs,
 		raw_result,
 		full_name, model_name, fit_method,
 		nobs, dof_model, fitted_dt, cov_type,
@ -136,16 +136,20 @@ class RegressionResult:
 		exp
 	):
 		# Data about how model fitted
-		#: statsmodels model class used for fitting the model
+		#: See :func:`yli.regress`
 		self.model_class = model_class
 		#: Data fitted (*weakref* to *DataFrame*)
 		self.df = df
-		#: Name of the dependent variable (*str*)
+		#: See :func:`yli.regress`
 		self.dep = dep
-		#: Formula for the model (*str*)
+		#: See :func:`yli.regress`
 		self.formula = formula
-		#: Other kwargs passed to :func:`yli.regress` (*dict*)
-		self.regress_kwargs = regress_kwargs
+		#: See :func:`yli.regress`
+		self.nan_policy = nan_policy
+		#: See :func:`yli.regress`
+		self.model_kwargs = model_kwargs
+		#: See :func:`yli.regress`
+		self.fit_kwargs = fit_kwargs
 		
 		#: Raw result from statsmodels *model.fit*
 		self.raw_result = raw_result
@ -260,9 +264,10 @@ class RegressionResult:
 		dep = self.dep
 		
 		# Check for/clean NaNs
+		# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
 		# Following this, we pass nan_policy='raise' to assert no NaNs remaining
 		df = df[[dep] + cols_for_formula(self.formula, df)]
-		df = check_nan(df, self.regress_kwargs['nan_policy'])
+		df = check_nan(df, 'omit')
 		
 		# Ensure numeric type for dependent variable
 		if df[dep].dtype != 'float64':
@ -280,7 +285,7 @@ class RegressionResult:
 		logit_models = []
 		for upper_limit in sorted(df[dep].unique())[:-1]:  # FIXME: Sort order
 			dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
-			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.regress_kwargs.get('fit_kwargs', {}))
+			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
 			
 			if not logit_result.mle_retvals['converged']:
 				raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
@ -346,6 +351,91 @@ class RegressionResult:
 		
 		return BrantResult(wald_results)
 	
+	def bootstrap(self, samples=1000):
+		"""
+		Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
+		
+		:param samples: Number of bootstrap samples to draw
+		:type samples: int
+		
+		:rtype: :class:`yli.regress.RegressionResult`
+		"""
+		
+		df = self.df()
+		if df is None:
+			raise Exception('Referenced DataFrame has been dropped')
+		dep = self.dep
+		
+		# Check for/clean NaNs
+		# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
+		# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+		df = df[[dep] + cols_for_formula(self.formula, df)]
+		df = check_nan(df, 'omit')
+		
+		# Ensure numeric type for dependent variable
+		if df[dep].dtype != 'float64':
+			df[dep] = df[dep].astype('float64')
+		
+		# Convert pandas nullable types for independent variables as this breaks statsmodels
+		df = convert_pandas_nullable(df)
+		
+		# Precompute design matrices
+		dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe')
+		
+		# Fit full model
+		#full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs)
+		
+		# Initialise bootstrap_results
+		bootstrap_results = {}  # Dict mapping term raw names to bootstrap betas
+		for term in self.terms.values():
+			if isinstance(term, SingleTerm):
+				bootstrap_results[term.raw_name] = []
+			else:
+				for sub_term in term.categories.values():
+					bootstrap_results[sub_term.raw_name] = []
+		
+		# Draw bootstrap samples and regress
+		dmatrices = dmatrices[0].join(dmatrices[1])
+		
+		for i in tqdm(range(samples)):
+			bootstrap_rows = dmatrices.sample(len(df), replace=True)
+			model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs)
+			model.formula = dep + ' ~ ' + self.formula
+			
+			result = model.fit(disp=False, **self.fit_kwargs)
+			
+			for raw_name, raw_beta in zip(model.exog_names, result._results.params):
+				bootstrap_results[raw_name].append(raw_beta)
+		
+		# Combine bootstrap results
+		terms = {}
+		for term_name, term in self.terms.items():
+			if isinstance(term, SingleTerm):
+				bootstrap_betas = bootstrap_results[term.raw_name]
+				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+				terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+			else:
+				categories = {}
+				for sub_term_name, sub_term in term.categories.items():
+					bootstrap_betas = bootstrap_results[sub_term.raw_name]
+					bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
+					bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
+					categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
+				terms[term_name] = CategoricalTerm(categories, term.ref_category)
+		
+		return RegressionResult(
+			self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs,
+			None,
+			self.full_name, self.model_name, self.fit_method,
+			self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
+			terms,
+			self.llf, self.llnull,
+			self.dof_resid, self.rsquared, self.f_statistic,
+			self.comments,
+			self.exp
+		)
+	
 	def _header_table(self, html):
 		"""Return the entries for the header table"""
 		
@ -641,9 +731,6 @@ def regress(
 	if start_params is not None:
 		fit_kwargs['start_params'] = start_params
 	
-	# Collect kwargs to store in result later
-	regress_kwargs = dict(nan_policy=nan_policy, fit_kwargs=fit_kwargs)
-	
 	# Autodetect whether to exponentiate
 	if exp is None:
 		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
@ -688,7 +775,9 @@ def regress(
 		result.df = df_ref
 		result.dep = dep
 		result.formula = formula
-		result.regress_kwargs = regress_kwargs
+		result.nan_policy = nan_policy
+		result.model_kwargs = model_kwargs
+		result.fit_kwargs = fit_kwargs
 		result.exp = exp
 		return result
 	
@ -770,7 +859,7 @@ def regress(
 		full_name = 'Robust {}'.format(full_name)
 	
 	return RegressionResult(
-		model_class, df_ref, dep, formula, regress_kwargs,
+		model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs,
 		result,
 		full_name, model_class.__name__, method_name,
 		result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
@ -781,106 +870,6 @@ def regress(
 		exp
 	)

-def regress_bootstrap(
-	model_class, df, dep, formula, *,
-	nan_policy='warn',
-	model_kwargs=None, fit_kwargs=None,
-	samples=1000,
-	**kwargs
-):
-	"""
-	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
-	
-	:param model_class: See :func:`yli.regress`
-	:param df: See :func:`yli.regress`
-	:param dep: See :func:`yli.regress`
-	:param formula: See :func:`yli.regress`
-	:param nan_policy: See :func:`yli.regress`
-	:param model_kwargs: See :func:`yli.regress`
-	:param fit_kwargs: See :func:`yli.regress`
-	:param samples: Number of bootstrap samples to draw
-	:type samples: int
-	:param kwargs: Other arguments to pass to :func:`yli.regress`
-	
-	:rtype: :class:`yli.regress.RegressionResult`
-	"""
-	
-	if model_kwargs is None:
-		model_kwargs = {}
-	if fit_kwargs is None:
-		fit_kwargs = {}
-	
-	df_ref = weakref.ref(df)
-	
-	# Check for/clean NaNs
-	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
-	df = df[[dep] + cols_for_formula(formula, df)]
-	df = check_nan(df, nan_policy)
-	
-	# Ensure numeric type for dependent variable
-	if df[dep].dtype != 'float64':
-		df[dep] = df[dep].astype('float64')
-	
-	# Convert pandas nullable types for independent variables as this breaks statsmodels
-	df = convert_pandas_nullable(df)
-	
-	# Precompute design matrices
-	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
-	
-	# Fit full model
-	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs)
-	
-	# Initialise bootstrap_results
-	bootstrap_results = {}  # Dict mapping term raw names to bootstrap betas
-	for term in full_model.terms.values():
-		if isinstance(term, SingleTerm):
-			bootstrap_results[term.raw_name] = []
-		else:
-			for sub_term in term.categories.values():
-				bootstrap_results[sub_term.raw_name] = []
-	
-	# Draw bootstrap samples and regress
-	dmatrices = dmatrices[0].join(dmatrices[1])
-	
-	for i in tqdm(range(samples)):
-		bootstrap_rows = dmatrices.sample(len(df), replace=True)
-		model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs)
-		model.formula = dep + ' ~ ' + formula
-		
-		result = model.fit(disp=False, **fit_kwargs)
-		
-		for raw_name, raw_beta in zip(model.exog_names, result._results.params):
-			bootstrap_results[raw_name].append(raw_beta)
-	
-	# Combine bootstrap results
-	terms = {}
-	for term_name, term in full_model.terms.items():
-		if isinstance(term, SingleTerm):
-			bootstrap_betas = bootstrap_results[term.raw_name]
-			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
-		else:
-			categories = {}
-			for sub_term_name, sub_term in term.categories.items():
-				bootstrap_betas = bootstrap_results[sub_term.raw_name]
-				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
-				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
-				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
-			terms[term_name] = CategoricalTerm(categories, term.ref_category)
-	
-	return RegressionResult(
-		model_class, df_ref, dep, formula, full_model.regress_kwargs,
-		None,
-		full_model.full_name, full_model.model_name, full_model.fit_method,
-		dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
-		terms,
-		full_model.llf, full_model.llnull,
-		full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
-		full_model.comments,
-		full_model.exp
-	)
-
 def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
 	"""
 	Perform logistic regression, then use parameters as start parameters for desired regression
@ -989,7 +978,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 				terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
 				
 				return RegressionResult(
-					None, None, None, None, None,  # Set in regress()
+					None, None, None, None, None, None, None,  # Set in regress()
 					model,
 					'Penalised Logistic Regression', 'Logit', 'Penalised ML',
 					model['n'][0], model['df'][0], datetime.now(), 'nonrobust',