Refactor bootstrap regression API as method of RegressionResult
This commit is contained in:
		
							parent
							
								
									4d59e1a521
								
							
						
					
					
						commit
						e4e85db354
					
				| @ -19,7 +19,7 @@ from .config import config | ||||
| from .descriptives import auto_descriptives | ||||
| from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist | ||||
| from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted | ||||
| from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif | ||||
| from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif | ||||
| from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind | ||||
| 
 | ||||
| def reload_me(): | ||||
|  | ||||
							
								
								
									
										217
									
								
								yli/regress.py
									
									
									
									
									
								
							
							
						
						
									
										217
									
								
								yli/regress.py
									
									
									
									
									
								
							| @ -125,7 +125,7 @@ class RegressionResult: | ||||
| 	""" | ||||
| 	 | ||||
| 	def __init__(self, | ||||
| 		model_class, df, dep, formula, regress_kwargs, | ||||
| 		model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs, | ||||
| 		raw_result, | ||||
| 		full_name, model_name, fit_method, | ||||
| 		nobs, dof_model, fitted_dt, cov_type, | ||||
| @ -136,16 +136,20 @@ class RegressionResult: | ||||
| 		exp | ||||
| 	): | ||||
| 		# Data about how model fitted | ||||
| 		#: statsmodels model class used for fitting the model | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.model_class = model_class | ||||
| 		#: Data fitted (*weakref* to *DataFrame*) | ||||
| 		self.df = df | ||||
| 		#: Name of the dependent variable (*str*) | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.dep = dep | ||||
| 		#: Formula for the model (*str*) | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.formula = formula | ||||
| 		#: Other kwargs passed to :func:`yli.regress` (*dict*) | ||||
| 		self.regress_kwargs = regress_kwargs | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.nan_policy = nan_policy | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.model_kwargs = model_kwargs | ||||
| 		#: See :func:`yli.regress` | ||||
| 		self.fit_kwargs = fit_kwargs | ||||
| 		 | ||||
| 		#: Raw result from statsmodels *model.fit* | ||||
| 		self.raw_result = raw_result | ||||
| @ -260,9 +264,10 @@ class RegressionResult: | ||||
| 		dep = self.dep | ||||
| 		 | ||||
| 		# Check for/clean NaNs | ||||
| 		# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' | ||||
| 		# Following this, we pass nan_policy='raise' to assert no NaNs remaining | ||||
| 		df = df[[dep] + cols_for_formula(self.formula, df)] | ||||
| 		df = check_nan(df, self.regress_kwargs['nan_policy']) | ||||
| 		df = check_nan(df, 'omit') | ||||
| 		 | ||||
| 		# Ensure numeric type for dependent variable | ||||
| 		if df[dep].dtype != 'float64': | ||||
| @ -280,7 +285,7 @@ class RegressionResult: | ||||
| 		logit_models = [] | ||||
| 		for upper_limit in sorted(df[dep].unique())[:-1]:  # FIXME: Sort order | ||||
| 			dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) | ||||
| 			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.regress_kwargs.get('fit_kwargs', {})) | ||||
| 			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) | ||||
| 			 | ||||
| 			if not logit_result.mle_retvals['converged']: | ||||
| 				raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) | ||||
| @ -346,6 +351,91 @@ class RegressionResult: | ||||
| 		 | ||||
| 		return BrantResult(wald_results) | ||||
| 	 | ||||
| 	def bootstrap(self, samples=1000): | ||||
| 		""" | ||||
| 		Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values | ||||
| 		 | ||||
| 		:param samples: Number of bootstrap samples to draw | ||||
| 		:type samples: int | ||||
| 		 | ||||
| 		:rtype: :class:`yli.regress.RegressionResult` | ||||
| 		""" | ||||
| 		 | ||||
| 		df = self.df() | ||||
| 		if df is None: | ||||
| 			raise Exception('Referenced DataFrame has been dropped') | ||||
| 		dep = self.dep | ||||
| 		 | ||||
| 		# Check for/clean NaNs | ||||
| 		# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' | ||||
| 		# Following this, we pass nan_policy='raise' to assert no NaNs remaining | ||||
| 		df = df[[dep] + cols_for_formula(self.formula, df)] | ||||
| 		df = check_nan(df, 'omit') | ||||
| 		 | ||||
| 		# Ensure numeric type for dependent variable | ||||
| 		if df[dep].dtype != 'float64': | ||||
| 			df[dep] = df[dep].astype('float64') | ||||
| 		 | ||||
| 		# Convert pandas nullable types for independent variables as this breaks statsmodels | ||||
| 		df = convert_pandas_nullable(df) | ||||
| 		 | ||||
| 		# Precompute design matrices | ||||
| 		dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe') | ||||
| 		 | ||||
| 		# Fit full model | ||||
| 		#full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs) | ||||
| 		 | ||||
| 		# Initialise bootstrap_results | ||||
| 		bootstrap_results = {}  # Dict mapping term raw names to bootstrap betas | ||||
| 		for term in self.terms.values(): | ||||
| 			if isinstance(term, SingleTerm): | ||||
| 				bootstrap_results[term.raw_name] = [] | ||||
| 			else: | ||||
| 				for sub_term in term.categories.values(): | ||||
| 					bootstrap_results[sub_term.raw_name] = [] | ||||
| 		 | ||||
| 		# Draw bootstrap samples and regress | ||||
| 		dmatrices = dmatrices[0].join(dmatrices[1]) | ||||
| 		 | ||||
| 		for i in tqdm(range(samples)): | ||||
| 			bootstrap_rows = dmatrices.sample(len(df), replace=True) | ||||
| 			model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs) | ||||
| 			model.formula = dep + ' ~ ' + self.formula | ||||
| 			 | ||||
| 			result = model.fit(disp=False, **self.fit_kwargs) | ||||
| 			 | ||||
| 			for raw_name, raw_beta in zip(model.exog_names, result._results.params): | ||||
| 				bootstrap_results[raw_name].append(raw_beta) | ||||
| 		 | ||||
| 		# Combine bootstrap results | ||||
| 		terms = {} | ||||
| 		for term_name, term in self.terms.items(): | ||||
| 			if isinstance(term, SingleTerm): | ||||
| 				bootstrap_betas = bootstrap_results[term.raw_name] | ||||
| 				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) | ||||
| 				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) | ||||
| 				terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) | ||||
| 			else: | ||||
| 				categories = {} | ||||
| 				for sub_term_name, sub_term in term.categories.items(): | ||||
| 					bootstrap_betas = bootstrap_results[sub_term.raw_name] | ||||
| 					bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) | ||||
| 					bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) | ||||
| 					categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) | ||||
| 				terms[term_name] = CategoricalTerm(categories, term.ref_category) | ||||
| 		 | ||||
| 		return RegressionResult( | ||||
| 			self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs, | ||||
| 			None, | ||||
| 			self.full_name, self.model_name, self.fit_method, | ||||
| 			self.nobs, self.dof_model, datetime.now(), 'Bootstrap', | ||||
| 			terms, | ||||
| 			self.llf, self.llnull, | ||||
| 			self.dof_resid, self.rsquared, self.f_statistic, | ||||
| 			self.comments, | ||||
| 			self.exp | ||||
| 		) | ||||
| 	 | ||||
| 	def _header_table(self, html): | ||||
| 		"""Return the entries for the header table""" | ||||
| 		 | ||||
| @ -641,9 +731,6 @@ def regress( | ||||
| 	if start_params is not None: | ||||
| 		fit_kwargs['start_params'] = start_params | ||||
| 	 | ||||
| 	# Collect kwargs to store in result later | ||||
| 	regress_kwargs = dict(nan_policy=nan_policy, fit_kwargs=fit_kwargs) | ||||
| 	 | ||||
| 	# Autodetect whether to exponentiate | ||||
| 	if exp is None: | ||||
| 		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit): | ||||
| @ -688,7 +775,9 @@ def regress( | ||||
| 		result.df = df_ref | ||||
| 		result.dep = dep | ||||
| 		result.formula = formula | ||||
| 		result.regress_kwargs = regress_kwargs | ||||
| 		result.nan_policy = nan_policy | ||||
| 		result.model_kwargs = model_kwargs | ||||
| 		result.fit_kwargs = fit_kwargs | ||||
| 		result.exp = exp | ||||
| 		return result | ||||
| 	 | ||||
| @ -770,7 +859,7 @@ def regress( | ||||
| 		full_name = 'Robust {}'.format(full_name) | ||||
| 	 | ||||
| 	return RegressionResult( | ||||
| 		model_class, df_ref, dep, formula, regress_kwargs, | ||||
| 		model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs, | ||||
| 		result, | ||||
| 		full_name, model_class.__name__, method_name, | ||||
| 		result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'), | ||||
| @ -781,106 +870,6 @@ def regress( | ||||
| 		exp | ||||
| 	) | ||||
| 
 | ||||
| def regress_bootstrap( | ||||
| 	model_class, df, dep, formula, *, | ||||
| 	nan_policy='warn', | ||||
| 	model_kwargs=None, fit_kwargs=None, | ||||
| 	samples=1000, | ||||
| 	**kwargs | ||||
| ): | ||||
| 	""" | ||||
| 	Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values | ||||
| 	 | ||||
| 	:param model_class: See :func:`yli.regress` | ||||
| 	:param df: See :func:`yli.regress` | ||||
| 	:param dep: See :func:`yli.regress` | ||||
| 	:param formula: See :func:`yli.regress` | ||||
| 	:param nan_policy: See :func:`yli.regress` | ||||
| 	:param model_kwargs: See :func:`yli.regress` | ||||
| 	:param fit_kwargs: See :func:`yli.regress` | ||||
| 	:param samples: Number of bootstrap samples to draw | ||||
| 	:type samples: int | ||||
| 	:param kwargs: Other arguments to pass to :func:`yli.regress` | ||||
| 	 | ||||
| 	:rtype: :class:`yli.regress.RegressionResult` | ||||
| 	""" | ||||
| 	 | ||||
| 	if model_kwargs is None: | ||||
| 		model_kwargs = {} | ||||
| 	if fit_kwargs is None: | ||||
| 		fit_kwargs = {} | ||||
| 	 | ||||
| 	df_ref = weakref.ref(df) | ||||
| 	 | ||||
| 	# Check for/clean NaNs | ||||
| 	# Following this, we pass nan_policy='raise' to assert no NaNs remaining | ||||
| 	df = df[[dep] + cols_for_formula(formula, df)] | ||||
| 	df = check_nan(df, nan_policy) | ||||
| 	 | ||||
| 	# Ensure numeric type for dependent variable | ||||
| 	if df[dep].dtype != 'float64': | ||||
| 		df[dep] = df[dep].astype('float64') | ||||
| 	 | ||||
| 	# Convert pandas nullable types for independent variables as this breaks statsmodels | ||||
| 	df = convert_pandas_nullable(df) | ||||
| 	 | ||||
| 	# Precompute design matrices | ||||
| 	dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') | ||||
| 	 | ||||
| 	# Fit full model | ||||
| 	full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs) | ||||
| 	 | ||||
| 	# Initialise bootstrap_results | ||||
| 	bootstrap_results = {}  # Dict mapping term raw names to bootstrap betas | ||||
| 	for term in full_model.terms.values(): | ||||
| 		if isinstance(term, SingleTerm): | ||||
| 			bootstrap_results[term.raw_name] = [] | ||||
| 		else: | ||||
| 			for sub_term in term.categories.values(): | ||||
| 				bootstrap_results[sub_term.raw_name] = [] | ||||
| 	 | ||||
| 	# Draw bootstrap samples and regress | ||||
| 	dmatrices = dmatrices[0].join(dmatrices[1]) | ||||
| 	 | ||||
| 	for i in tqdm(range(samples)): | ||||
| 		bootstrap_rows = dmatrices.sample(len(df), replace=True) | ||||
| 		model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs) | ||||
| 		model.formula = dep + ' ~ ' + formula | ||||
| 		 | ||||
| 		result = model.fit(disp=False, **fit_kwargs) | ||||
| 		 | ||||
| 		for raw_name, raw_beta in zip(model.exog_names, result._results.params): | ||||
| 			bootstrap_results[raw_name].append(raw_beta) | ||||
| 	 | ||||
| 	# Combine bootstrap results | ||||
| 	terms = {} | ||||
| 	for term_name, term in full_model.terms.items(): | ||||
| 		if isinstance(term, SingleTerm): | ||||
| 			bootstrap_betas = bootstrap_results[term.raw_name] | ||||
| 			bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) | ||||
| 			bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) | ||||
| 			terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) | ||||
| 		else: | ||||
| 			categories = {} | ||||
| 			for sub_term_name, sub_term in term.categories.items(): | ||||
| 				bootstrap_betas = bootstrap_results[sub_term.raw_name] | ||||
| 				bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) | ||||
| 				bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) | ||||
| 				categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) | ||||
| 			terms[term_name] = CategoricalTerm(categories, term.ref_category) | ||||
| 	 | ||||
| 	return RegressionResult( | ||||
| 		model_class, df_ref, dep, formula, full_model.regress_kwargs, | ||||
| 		None, | ||||
| 		full_model.full_name, full_model.model_name, full_model.fit_method, | ||||
| 		dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap', | ||||
| 		terms, | ||||
| 		full_model.llf, full_model.llnull, | ||||
| 		full_model.dof_resid, full_model.rsquared, full_model.f_statistic, | ||||
| 		full_model.comments, | ||||
| 		full_model.exp | ||||
| 	) | ||||
| 
 | ||||
| def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs): | ||||
| 	""" | ||||
| 	Perform logistic regression, then use parameters as start parameters for desired regression | ||||
| @ -989,7 +978,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): | ||||
| 				terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])} | ||||
| 				 | ||||
| 				return RegressionResult( | ||||
| 					None, None, None, None, None,  # Set in regress() | ||||
| 					None, None, None, None, None, None, None,  # Set in regress() | ||||
| 					model, | ||||
| 					'Penalised Logistic Regression', 'Logit', 'Penalised ML', | ||||
| 					model['n'][0], model['df'][0], datetime.now(), 'nonrobust', | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user