Refactor bootstrap regression API as method of RegressionResult
This commit is contained in:
parent
4d59e1a521
commit
e4e85db354
@ -19,7 +19,7 @@ from .config import config
|
|||||||
from .descriptives import auto_descriptives
|
from .descriptives import auto_descriptives
|
||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
|
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
|
||||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
|
||||||
|
|
||||||
def reload_me():
|
def reload_me():
|
||||||
|
217
yli/regress.py
217
yli/regress.py
@ -125,7 +125,7 @@ class RegressionResult:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
model_class, df, dep, formula, regress_kwargs,
|
model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs,
|
||||||
raw_result,
|
raw_result,
|
||||||
full_name, model_name, fit_method,
|
full_name, model_name, fit_method,
|
||||||
nobs, dof_model, fitted_dt, cov_type,
|
nobs, dof_model, fitted_dt, cov_type,
|
||||||
@ -136,16 +136,20 @@ class RegressionResult:
|
|||||||
exp
|
exp
|
||||||
):
|
):
|
||||||
# Data about how model fitted
|
# Data about how model fitted
|
||||||
#: statsmodels model class used for fitting the model
|
#: See :func:`yli.regress`
|
||||||
self.model_class = model_class
|
self.model_class = model_class
|
||||||
#: Data fitted (*weakref* to *DataFrame*)
|
#: Data fitted (*weakref* to *DataFrame*)
|
||||||
self.df = df
|
self.df = df
|
||||||
#: Name of the dependent variable (*str*)
|
#: See :func:`yli.regress`
|
||||||
self.dep = dep
|
self.dep = dep
|
||||||
#: Formula for the model (*str*)
|
#: See :func:`yli.regress`
|
||||||
self.formula = formula
|
self.formula = formula
|
||||||
#: Other kwargs passed to :func:`yli.regress` (*dict*)
|
#: See :func:`yli.regress`
|
||||||
self.regress_kwargs = regress_kwargs
|
self.nan_policy = nan_policy
|
||||||
|
#: See :func:`yli.regress`
|
||||||
|
self.model_kwargs = model_kwargs
|
||||||
|
#: See :func:`yli.regress`
|
||||||
|
self.fit_kwargs = fit_kwargs
|
||||||
|
|
||||||
#: Raw result from statsmodels *model.fit*
|
#: Raw result from statsmodels *model.fit*
|
||||||
self.raw_result = raw_result
|
self.raw_result = raw_result
|
||||||
@ -260,9 +264,10 @@ class RegressionResult:
|
|||||||
dep = self.dep
|
dep = self.dep
|
||||||
|
|
||||||
# Check for/clean NaNs
|
# Check for/clean NaNs
|
||||||
|
# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
|
||||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
df = df[[dep] + cols_for_formula(self.formula, df)]
|
df = df[[dep] + cols_for_formula(self.formula, df)]
|
||||||
df = check_nan(df, self.regress_kwargs['nan_policy'])
|
df = check_nan(df, 'omit')
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
# Ensure numeric type for dependent variable
|
||||||
if df[dep].dtype != 'float64':
|
if df[dep].dtype != 'float64':
|
||||||
@ -280,7 +285,7 @@ class RegressionResult:
|
|||||||
logit_models = []
|
logit_models = []
|
||||||
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
|
for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
|
||||||
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
|
dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
|
||||||
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.regress_kwargs.get('fit_kwargs', {}))
|
logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs)
|
||||||
|
|
||||||
if not logit_result.mle_retvals['converged']:
|
if not logit_result.mle_retvals['converged']:
|
||||||
raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
|
raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
|
||||||
@ -346,6 +351,91 @@ class RegressionResult:
|
|||||||
|
|
||||||
return BrantResult(wald_results)
|
return BrantResult(wald_results)
|
||||||
|
|
||||||
|
def bootstrap(self, samples=1000):
|
||||||
|
"""
|
||||||
|
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
|
||||||
|
|
||||||
|
:param samples: Number of bootstrap samples to draw
|
||||||
|
:type samples: int
|
||||||
|
|
||||||
|
:rtype: :class:`yli.regress.RegressionResult`
|
||||||
|
"""
|
||||||
|
|
||||||
|
df = self.df()
|
||||||
|
if df is None:
|
||||||
|
raise Exception('Referenced DataFrame has been dropped')
|
||||||
|
dep = self.dep
|
||||||
|
|
||||||
|
# Check for/clean NaNs
|
||||||
|
# NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit'
|
||||||
|
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
||||||
|
df = df[[dep] + cols_for_formula(self.formula, df)]
|
||||||
|
df = check_nan(df, 'omit')
|
||||||
|
|
||||||
|
# Ensure numeric type for dependent variable
|
||||||
|
if df[dep].dtype != 'float64':
|
||||||
|
df[dep] = df[dep].astype('float64')
|
||||||
|
|
||||||
|
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
||||||
|
df = convert_pandas_nullable(df)
|
||||||
|
|
||||||
|
# Precompute design matrices
|
||||||
|
dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe')
|
||||||
|
|
||||||
|
# Fit full model
|
||||||
|
#full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs)
|
||||||
|
|
||||||
|
# Initialise bootstrap_results
|
||||||
|
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
|
||||||
|
for term in self.terms.values():
|
||||||
|
if isinstance(term, SingleTerm):
|
||||||
|
bootstrap_results[term.raw_name] = []
|
||||||
|
else:
|
||||||
|
for sub_term in term.categories.values():
|
||||||
|
bootstrap_results[sub_term.raw_name] = []
|
||||||
|
|
||||||
|
# Draw bootstrap samples and regress
|
||||||
|
dmatrices = dmatrices[0].join(dmatrices[1])
|
||||||
|
|
||||||
|
for i in tqdm(range(samples)):
|
||||||
|
bootstrap_rows = dmatrices.sample(len(df), replace=True)
|
||||||
|
model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs)
|
||||||
|
model.formula = dep + ' ~ ' + self.formula
|
||||||
|
|
||||||
|
result = model.fit(disp=False, **self.fit_kwargs)
|
||||||
|
|
||||||
|
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
|
||||||
|
bootstrap_results[raw_name].append(raw_beta)
|
||||||
|
|
||||||
|
# Combine bootstrap results
|
||||||
|
terms = {}
|
||||||
|
for term_name, term in self.terms.items():
|
||||||
|
if isinstance(term, SingleTerm):
|
||||||
|
bootstrap_betas = bootstrap_results[term.raw_name]
|
||||||
|
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||||
|
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||||
|
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
||||||
|
else:
|
||||||
|
categories = {}
|
||||||
|
for sub_term_name, sub_term in term.categories.items():
|
||||||
|
bootstrap_betas = bootstrap_results[sub_term.raw_name]
|
||||||
|
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
||||||
|
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
||||||
|
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
||||||
|
terms[term_name] = CategoricalTerm(categories, term.ref_category)
|
||||||
|
|
||||||
|
return RegressionResult(
|
||||||
|
self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs,
|
||||||
|
None,
|
||||||
|
self.full_name, self.model_name, self.fit_method,
|
||||||
|
self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
|
||||||
|
terms,
|
||||||
|
self.llf, self.llnull,
|
||||||
|
self.dof_resid, self.rsquared, self.f_statistic,
|
||||||
|
self.comments,
|
||||||
|
self.exp
|
||||||
|
)
|
||||||
|
|
||||||
def _header_table(self, html):
|
def _header_table(self, html):
|
||||||
"""Return the entries for the header table"""
|
"""Return the entries for the header table"""
|
||||||
|
|
||||||
@ -641,9 +731,6 @@ def regress(
|
|||||||
if start_params is not None:
|
if start_params is not None:
|
||||||
fit_kwargs['start_params'] = start_params
|
fit_kwargs['start_params'] = start_params
|
||||||
|
|
||||||
# Collect kwargs to store in result later
|
|
||||||
regress_kwargs = dict(nan_policy=nan_policy, fit_kwargs=fit_kwargs)
|
|
||||||
|
|
||||||
# Autodetect whether to exponentiate
|
# Autodetect whether to exponentiate
|
||||||
if exp is None:
|
if exp is None:
|
||||||
if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
|
if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
|
||||||
@ -688,7 +775,9 @@ def regress(
|
|||||||
result.df = df_ref
|
result.df = df_ref
|
||||||
result.dep = dep
|
result.dep = dep
|
||||||
result.formula = formula
|
result.formula = formula
|
||||||
result.regress_kwargs = regress_kwargs
|
result.nan_policy = nan_policy
|
||||||
|
result.model_kwargs = model_kwargs
|
||||||
|
result.fit_kwargs = fit_kwargs
|
||||||
result.exp = exp
|
result.exp = exp
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -770,7 +859,7 @@ def regress(
|
|||||||
full_name = 'Robust {}'.format(full_name)
|
full_name = 'Robust {}'.format(full_name)
|
||||||
|
|
||||||
return RegressionResult(
|
return RegressionResult(
|
||||||
model_class, df_ref, dep, formula, regress_kwargs,
|
model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs,
|
||||||
result,
|
result,
|
||||||
full_name, model_class.__name__, method_name,
|
full_name, model_class.__name__, method_name,
|
||||||
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
|
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
|
||||||
@ -781,106 +870,6 @@ def regress(
|
|||||||
exp
|
exp
|
||||||
)
|
)
|
||||||
|
|
||||||
def regress_bootstrap(
|
|
||||||
model_class, df, dep, formula, *,
|
|
||||||
nan_policy='warn',
|
|
||||||
model_kwargs=None, fit_kwargs=None,
|
|
||||||
samples=1000,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values
|
|
||||||
|
|
||||||
:param model_class: See :func:`yli.regress`
|
|
||||||
:param df: See :func:`yli.regress`
|
|
||||||
:param dep: See :func:`yli.regress`
|
|
||||||
:param formula: See :func:`yli.regress`
|
|
||||||
:param nan_policy: See :func:`yli.regress`
|
|
||||||
:param model_kwargs: See :func:`yli.regress`
|
|
||||||
:param fit_kwargs: See :func:`yli.regress`
|
|
||||||
:param samples: Number of bootstrap samples to draw
|
|
||||||
:type samples: int
|
|
||||||
:param kwargs: Other arguments to pass to :func:`yli.regress`
|
|
||||||
|
|
||||||
:rtype: :class:`yli.regress.RegressionResult`
|
|
||||||
"""
|
|
||||||
|
|
||||||
if model_kwargs is None:
|
|
||||||
model_kwargs = {}
|
|
||||||
if fit_kwargs is None:
|
|
||||||
fit_kwargs = {}
|
|
||||||
|
|
||||||
df_ref = weakref.ref(df)
|
|
||||||
|
|
||||||
# Check for/clean NaNs
|
|
||||||
# Following this, we pass nan_policy='raise' to assert no NaNs remaining
|
|
||||||
df = df[[dep] + cols_for_formula(formula, df)]
|
|
||||||
df = check_nan(df, nan_policy)
|
|
||||||
|
|
||||||
# Ensure numeric type for dependent variable
|
|
||||||
if df[dep].dtype != 'float64':
|
|
||||||
df[dep] = df[dep].astype('float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables as this breaks statsmodels
|
|
||||||
df = convert_pandas_nullable(df)
|
|
||||||
|
|
||||||
# Precompute design matrices
|
|
||||||
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
|
||||||
|
|
||||||
# Fit full model
|
|
||||||
full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs)
|
|
||||||
|
|
||||||
# Initialise bootstrap_results
|
|
||||||
bootstrap_results = {} # Dict mapping term raw names to bootstrap betas
|
|
||||||
for term in full_model.terms.values():
|
|
||||||
if isinstance(term, SingleTerm):
|
|
||||||
bootstrap_results[term.raw_name] = []
|
|
||||||
else:
|
|
||||||
for sub_term in term.categories.values():
|
|
||||||
bootstrap_results[sub_term.raw_name] = []
|
|
||||||
|
|
||||||
# Draw bootstrap samples and regress
|
|
||||||
dmatrices = dmatrices[0].join(dmatrices[1])
|
|
||||||
|
|
||||||
for i in tqdm(range(samples)):
|
|
||||||
bootstrap_rows = dmatrices.sample(len(df), replace=True)
|
|
||||||
model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs)
|
|
||||||
model.formula = dep + ' ~ ' + formula
|
|
||||||
|
|
||||||
result = model.fit(disp=False, **fit_kwargs)
|
|
||||||
|
|
||||||
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
|
|
||||||
bootstrap_results[raw_name].append(raw_beta)
|
|
||||||
|
|
||||||
# Combine bootstrap results
|
|
||||||
terms = {}
|
|
||||||
for term_name, term in full_model.terms.items():
|
|
||||||
if isinstance(term, SingleTerm):
|
|
||||||
bootstrap_betas = bootstrap_results[term.raw_name]
|
|
||||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
|
||||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
|
||||||
terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
|
||||||
else:
|
|
||||||
categories = {}
|
|
||||||
for sub_term_name, sub_term in term.categories.items():
|
|
||||||
bootstrap_betas = bootstrap_results[sub_term.raw_name]
|
|
||||||
bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas)
|
|
||||||
bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue)
|
|
||||||
categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue)
|
|
||||||
terms[term_name] = CategoricalTerm(categories, term.ref_category)
|
|
||||||
|
|
||||||
return RegressionResult(
|
|
||||||
model_class, df_ref, dep, formula, full_model.regress_kwargs,
|
|
||||||
None,
|
|
||||||
full_model.full_name, full_model.model_name, full_model.fit_method,
|
|
||||||
dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
|
|
||||||
terms,
|
|
||||||
full_model.llf, full_model.llnull,
|
|
||||||
full_model.dof_resid, full_model.rsquared, full_model.f_statistic,
|
|
||||||
full_model.comments,
|
|
||||||
full_model.exp
|
|
||||||
)
|
|
||||||
|
|
||||||
def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
|
def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs):
|
||||||
"""
|
"""
|
||||||
Perform logistic regression, then use parameters as start parameters for desired regression
|
Perform logistic regression, then use parameters as start parameters for desired regression
|
||||||
@ -989,7 +978,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
|
|||||||
terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
|
terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
|
||||||
|
|
||||||
return RegressionResult(
|
return RegressionResult(
|
||||||
None, None, None, None, None, # Set in regress()
|
None, None, None, None, None, None, None, # Set in regress()
|
||||||
model,
|
model,
|
||||||
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
|
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
|
||||||
model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
|
model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
|
||||||
|
Loading…
Reference in New Issue
Block a user