diff --git a/yli/__init__.py b/yli/__init__.py index fff4bac..0b527bd 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -19,7 +19,7 @@ from .config import config from .descriptives import auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif +from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind def reload_me(): diff --git a/yli/regress.py b/yli/regress.py index aca9deb..3f6cbf8 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -125,7 +125,7 @@ class RegressionResult: """ def __init__(self, - model_class, df, dep, formula, regress_kwargs, + model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs, raw_result, full_name, model_name, fit_method, nobs, dof_model, fitted_dt, cov_type, @@ -136,16 +136,20 @@ class RegressionResult: exp ): # Data about how model fitted - #: statsmodels model class used for fitting the model + #: See :func:`yli.regress` self.model_class = model_class #: Data fitted (*weakref* to *DataFrame*) self.df = df - #: Name of the dependent variable (*str*) + #: See :func:`yli.regress` self.dep = dep - #: Formula for the model (*str*) + #: See :func:`yli.regress` self.formula = formula - #: Other kwargs passed to :func:`yli.regress` (*dict*) - self.regress_kwargs = regress_kwargs + #: See :func:`yli.regress` + self.nan_policy = nan_policy + #: See :func:`yli.regress` + self.model_kwargs = model_kwargs + #: See :func:`yli.regress` + self.fit_kwargs = fit_kwargs #: Raw result from statsmodels *model.fit* self.raw_result = raw_result @@ -260,9 +264,10 @@ class RegressionResult: dep = self.dep # Check for/clean NaNs + # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' # Following this, we pass nan_policy='raise' to assert no NaNs remaining df = df[[dep] + cols_for_formula(self.formula, df)] - df = check_nan(df, self.regress_kwargs['nan_policy']) + df = check_nan(df, 'omit') # Ensure numeric type for dependent variable if df[dep].dtype != 'float64': @@ -280,7 +285,7 @@ class RegressionResult: logit_models = [] for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) - logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.regress_kwargs.get('fit_kwargs', {})) + logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) if not logit_result.mle_retvals['converged']: raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) @@ -346,6 +351,91 @@ class RegressionResult: return BrantResult(wald_results) + def bootstrap(self, samples=1000): + """ + Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values + + :param samples: Number of bootstrap samples to draw + :type samples: int + + :rtype: :class:`yli.regress.RegressionResult` + """ + + df = self.df() + if df is None: + raise Exception('Referenced DataFrame has been dropped') + dep = self.dep + + # Check for/clean NaNs + # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' + # Following this, we pass nan_policy='raise' to assert no NaNs remaining + df = df[[dep] + cols_for_formula(self.formula, df)] + df = check_nan(df, 'omit') + + # Ensure numeric type for dependent variable + if df[dep].dtype != 'float64': + df[dep] = df[dep].astype('float64') + + # Convert pandas nullable types for independent variables as this breaks statsmodels + df = convert_pandas_nullable(df) + + # Precompute design matrices + dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe') + + # Fit full model + #full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs) + + # Initialise bootstrap_results + bootstrap_results = {} # Dict mapping term raw names to bootstrap betas + for term in self.terms.values(): + if isinstance(term, SingleTerm): + bootstrap_results[term.raw_name] = [] + else: + for sub_term in term.categories.values(): + bootstrap_results[sub_term.raw_name] = [] + + # Draw bootstrap samples and regress + dmatrices = dmatrices[0].join(dmatrices[1]) + + for i in tqdm(range(samples)): + bootstrap_rows = dmatrices.sample(len(df), replace=True) + model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs) + model.formula = dep + ' ~ ' + self.formula + + result = model.fit(disp=False, **self.fit_kwargs) + + for raw_name, raw_beta in zip(model.exog_names, result._results.params): + bootstrap_results[raw_name].append(raw_beta) + + # Combine bootstrap results + terms = {} + for term_name, term in self.terms.items(): + if isinstance(term, SingleTerm): + bootstrap_betas = bootstrap_results[term.raw_name] + bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) + bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) + terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) + else: + categories = {} + for sub_term_name, sub_term in term.categories.items(): + bootstrap_betas = bootstrap_results[sub_term.raw_name] + bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) + bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) + categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) + terms[term_name] = CategoricalTerm(categories, term.ref_category) + + return RegressionResult( + self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs, + None, + self.full_name, self.model_name, self.fit_method, + self.nobs, self.dof_model, datetime.now(), 'Bootstrap', + terms, + self.llf, self.llnull, + self.dof_resid, self.rsquared, self.f_statistic, + self.comments, + self.exp + ) + def _header_table(self, html): """Return the entries for the header table""" @@ -641,9 +731,6 @@ def regress( if start_params is not None: fit_kwargs['start_params'] = start_params - # Collect kwargs to store in result later - regress_kwargs = dict(nan_policy=nan_policy, fit_kwargs=fit_kwargs) - # Autodetect whether to exponentiate if exp is None: if model_class in (sm.Logit, sm.Poisson, PenalisedLogit): @@ -688,7 +775,9 @@ def regress( result.df = df_ref result.dep = dep result.formula = formula - result.regress_kwargs = regress_kwargs + result.nan_policy = nan_policy + result.model_kwargs = model_kwargs + result.fit_kwargs = fit_kwargs result.exp = exp return result @@ -770,7 +859,7 @@ def regress( full_name = 'Robust {}'.format(full_name) return RegressionResult( - model_class, df_ref, dep, formula, regress_kwargs, + model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs, result, full_name, model_class.__name__, method_name, result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'), @@ -781,106 +870,6 @@ def regress( exp ) -def regress_bootstrap( - model_class, df, dep, formula, *, - nan_policy='warn', - model_kwargs=None, fit_kwargs=None, - samples=1000, - **kwargs -): - """ - Fit a statsmodels regression model, using bootstrapping to compute confidence intervals and *p* values - - :param model_class: See :func:`yli.regress` - :param df: See :func:`yli.regress` - :param dep: See :func:`yli.regress` - :param formula: See :func:`yli.regress` - :param nan_policy: See :func:`yli.regress` - :param model_kwargs: See :func:`yli.regress` - :param fit_kwargs: See :func:`yli.regress` - :param samples: Number of bootstrap samples to draw - :type samples: int - :param kwargs: Other arguments to pass to :func:`yli.regress` - - :rtype: :class:`yli.regress.RegressionResult` - """ - - if model_kwargs is None: - model_kwargs = {} - if fit_kwargs is None: - fit_kwargs = {} - - df_ref = weakref.ref(df) - - # Check for/clean NaNs - # Following this, we pass nan_policy='raise' to assert no NaNs remaining - df = df[[dep] + cols_for_formula(formula, df)] - df = check_nan(df, nan_policy) - - # Ensure numeric type for dependent variable - if df[dep].dtype != 'float64': - df[dep] = df[dep].astype('float64') - - # Convert pandas nullable types for independent variables as this breaks statsmodels - df = convert_pandas_nullable(df) - - # Precompute design matrices - dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') - - # Fit full model - full_model = regress(model_class, df, dep, formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=model_kwargs, fit_kwargs=fit_kwargs, **kwargs) - - # Initialise bootstrap_results - bootstrap_results = {} # Dict mapping term raw names to bootstrap betas - for term in full_model.terms.values(): - if isinstance(term, SingleTerm): - bootstrap_results[term.raw_name] = [] - else: - for sub_term in term.categories.values(): - bootstrap_results[sub_term.raw_name] = [] - - # Draw bootstrap samples and regress - dmatrices = dmatrices[0].join(dmatrices[1]) - - for i in tqdm(range(samples)): - bootstrap_rows = dmatrices.sample(len(df), replace=True) - model = model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **model_kwargs) - model.formula = dep + ' ~ ' + formula - - result = model.fit(disp=False, **fit_kwargs) - - for raw_name, raw_beta in zip(model.exog_names, result._results.params): - bootstrap_results[raw_name].append(raw_beta) - - # Combine bootstrap results - terms = {} - for term_name, term in full_model.terms.items(): - if isinstance(term, SingleTerm): - bootstrap_betas = bootstrap_results[term.raw_name] - bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) - bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) - terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) - else: - categories = {} - for sub_term_name, sub_term in term.categories.items(): - bootstrap_betas = bootstrap_results[sub_term.raw_name] - bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) - bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) - categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) - terms[term_name] = CategoricalTerm(categories, term.ref_category) - - return RegressionResult( - model_class, df_ref, dep, formula, full_model.regress_kwargs, - None, - full_model.full_name, full_model.model_name, full_model.fit_method, - dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap', - terms, - full_model.llf, full_model.llnull, - full_model.dof_resid, full_model.rsquared, full_model.f_statistic, - full_model.comments, - full_model.exp - ) - def logit_then_regress(model_class, df, dep, formula, *, nan_policy='warn', **kwargs): """ Perform logistic regression, then use parameters as start parameters for desired regression @@ -989,7 +978,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel): terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])} return RegressionResult( - None, None, None, None, None, # Set in regress() + None, None, None, None, None, None, None, # Set in regress() model, 'Penalised Logistic Regression', 'Logit', 'Penalised ML', model['n'][0], model['df'][0], datetime.now(), 'nonrobust',