diff --git a/yli/__init__.py b/yli/__init__.py
index a3c0ce3..fff4bac 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -19,7 +19,7 @@ from .config import config
 from .descriptives import auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif
+from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind
 
 def reload_me():
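Note for reviewers: the export change above removes the free function `yli.brant(...)` from the public API; the test is now reached through the fitted result (see the method added to `RegressionResult` below). A minimal sketch of the new call pattern, assuming the usual `yli.regress(model_class, df, dep, formula)` signature; the `Severity`/`Dose` data is invented for illustration:

```python
import pandas as pd
import yli

df = pd.DataFrame({
	'Severity': [1, 1, 2, 2, 3, 3, 2, 1, 3, 2],  # ordinal outcome (3 levels)
	'Dose':     [0, 1, 0, 1, 0, 1, 1, 0, 1, 0],
})

# Fit an ordinal model, then test proportional odds on the result;
# previously this was the module-level yli.brant(df, 'Severity', 'Dose')
result = yli.regress(yli.OrdinalLogit, df, 'Severity', 'Dose')
brant_result = result.brant()
```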
diff --git a/yli/regress.py b/yli/regress.py
index 4d7edf1..aca9deb 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -28,6 +28,7 @@ from tqdm import tqdm
 from datetime import datetime
 import itertools
 import warnings
+import weakref
 
 from .bayes_factors import BayesFactor, bayesfactor_afbf
 from .config import config
@@ -124,15 +125,28 @@ class RegressionResult:
 	"""
 	
 	def __init__(self,
+		model_class, df, dep, formula, regress_kwargs,
 		raw_result,
 		full_name, model_name, fit_method,
-		dep, nobs, dof_model, fitted_dt, cov_type,
+		nobs, dof_model, fitted_dt, cov_type,
 		terms,
 		llf, llnull,
 		dof_resid, rsquared, f_statistic,
 		comments,
 		exp
 	):
+		# Data about how model fitted
+		#: statsmodels model class used for fitting the model
+		self.model_class = model_class
+		#: Data fitted (*weakref* to *DataFrame*)
+		self.df = df
+		#: Name of the dependent variable (*str*)
+		self.dep = dep
+		#: Formula for the model (*str*)
+		self.formula = formula
+		#: Other kwargs passed to :func:`yli.regress` (*dict*)
+		self.regress_kwargs = regress_kwargs
+		
 		#: Raw result from statsmodels *model.fit*
 		self.raw_result = raw_result
@@ -145,8 +159,6 @@ class RegressionResult:
 		self.fit_method = fit_method
 		
 		# Basic fitted model information
-		#: Name of the dependent variable (*str*)
-		self.dep = dep
 		#: Number of observations (*int*)
 		self.nobs = nobs
 		#: Degrees of freedom for the model (*int*)
 		self.dof_model = dof_model
@@ -239,6 +251,101 @@
 	else:
 		return bf01.invert()
 	
+	def brant(self):
+		# TODO: Documentation
+		
+		df = self.df()
+		if df is None:
+			raise Exception('Referenced DataFrame has been dropped')
+		dep = self.dep
+		
+		# Check for/clean NaNs
+		# Following this, we pass nan_policy='raise' to assert no NaNs remaining
+		df = df[[dep] + cols_for_formula(self.formula, df)]
+		df = check_nan(df, self.regress_kwargs['nan_policy'])
+		
+		# Ensure numeric type for dependent variable
+		if df[dep].dtype != 'float64':
+			df[dep] = df[dep].astype('float64')
+		
+		# Convert pandas nullable types for independent variables as this breaks statsmodels
+		df = convert_pandas_nullable(df)
+		
+		# Precompute design matrix for RHS
+		# This is also X+ in Brant paper
+		dmatrix_right = patsy.dmatrix(self.formula, df, return_type='dataframe')
+		dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
+		
+		# Fit individual logistic regressions
+		logit_models = []
+		for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
+			dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
+			logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.regress_kwargs.get('fit_kwargs', {}))
+			
+			if not logit_result.mle_retvals['converged']:
+				raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
+			
+			if pd.isna(logit_result.bse).any():
+				raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
+			
+			logit_models.append(logit_result)
+		
+		logit_betas = np.array([model._results.params for model in logit_models]).T
+		logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
+		
+		# vcov is the variance-covariance matrix of all individually fitted betas across all terms
+		
+		#         | model 1 | model 2 | model 3 | ...
+		#         | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
+		# model 1 | term 1 |
+		#         | term 2 |
+		# model 2 | term 1 |
+		#         | term 2 |
+		# ...
+		
+		n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
+		n_betas = len(logit_models) * n_terms
+		vcov = np.zeros((n_betas, n_betas))
+		
+		# Populate the variance-covariance matrix for comparisons between individually fitted models
+		for j in range(0, len(logit_models) - 1):
+			for l in range(j + 1, len(logit_models)):
+				Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
+				Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
+				Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
+				
+				matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
+				j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
+				
+				vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
+				vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
+		
+		# Populate the variance-covariance matrix within each individually fitted model
+		for i in range(len(logit_models)):
+			vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
+		
+		# ------------------
+		# Perform Wald tests
+		
+		beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
+		wald_results = {}
+		
+		# Omnibus test
+		constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
+		constraint = ', '.join(constraints)
+		df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
+		wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+		wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+		
+		# Individual terms
+		for raw_name in dmatrix_right.columns[1:]:
+			constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
+			df = len(logit_models) - 1 # df = (number of levels minus 2)
+			wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
+			wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
+		
+		return BrantResult(wald_results)
+	
 	def _header_table(self, html):
 		"""Return the entries for the header table"""
 	
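Note for reviewers checking the block above against the Brant paper that the comments cite: `expit(-model.fittedvalues)` is pihat_j = P(y > threshold_j), since each binary model is fitted on the indicator y <= threshold_j, and the off-diagonal blocks compute (as I read the code)

    Cov(beta_j, beta_l) = (X' Wjj X)^-1 X' Wjl X (X' Wll X)^-1,  with  Wjl = diag(pihat_l - pihat_j * pihat_l)  for j <= l,

with the intercept row and column sliced off (`[1:,1:]`) before each block is written into `vcov`.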
@@ -534,6 +641,9 @@ def regress(
 	if start_params is not None:
 		fit_kwargs['start_params'] = start_params
 	
+	# Collect kwargs to store in result later
+	regress_kwargs = dict(nan_policy=nan_policy, fit_kwargs=fit_kwargs)
+	
 	# Autodetect whether to exponentiate
 	if exp is None:
 		if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
@@ -543,6 +653,8 @@ def regress(
 	else:
 		exp = False
 	
+	df_ref = weakref.ref(df)
+	
 	if _dmatrices is None:
 		# Check for/clean NaNs
 		df = df[[dep] + cols_for_formula(formula, df)]
@@ -572,6 +684,11 @@ def regress(
 	
 	if isinstance(result, RegressionResult):
 		# Already processed!
+		result.model_class = model_class
+		result.df = df_ref
+		result.dep = dep
+		result.formula = formula
+		result.regress_kwargs = regress_kwargs
 		result.exp = exp
 		return result
 	
@@ -653,9 +770,10 @@ def regress(
 		full_name = 'Robust {}'.format(full_name)
 	
 	return RegressionResult(
+		model_class, df_ref, dep, formula, regress_kwargs,
 		result,
 		full_name, model_class.__name__, method_name,
-		dep, result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
+		result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
 		terms,
 		result.llf, llnull,
 		getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
@@ -692,6 +810,8 @@ def regress_bootstrap(
 	if fit_kwargs is None:
 		fit_kwargs = {}
 	
+	df_ref = weakref.ref(df)
+	
 	# Check for/clean NaNs
 	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
 	df = df[[dep] + cols_for_formula(formula, df)]
@@ -750,6 +870,7 @@ def regress_bootstrap(
 			terms[term_name] = CategoricalTerm(categories, term.ref_category)
 	
 	return RegressionResult(
+		model_class, df_ref, dep, formula, full_model.regress_kwargs,
 		None,
 		full_model.full_name, full_model.model_name, full_model.fit_method,
-		dep, full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
+		full_model.nobs, full_model.dof_model, datetime.now(), 'Bootstrap',
@@ -868,9 +989,10 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
 		terms = {t: SingleTerm(t, Estimate(b, ci0, ci1), p) for t, b, ci0, ci1, p in zip(model['terms'], model['coefficients'], model['ci.lower'], model['ci.upper'], model['prob'])}
 		
 		return RegressionResult(
+			None, None, None, None, None, # Set in regress()
 			model,
 			'Penalised Logistic Regression', 'Logit', 'Penalised ML',
-			self.endog_names, model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
+			model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
 			terms,
 			model['loglik'][0], model['loglik'][1],
 			None, None, None,
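Note for reviewers: `RegressionResult.df` is a *weak* reference, so storing it does not keep the caller's DataFrame alive; that design choice is why `brant()` can fail at call time with 'Referenced DataFrame has been dropped'. A minimal standard-library sketch of the behaviour (not yli code):

```python
import weakref
import pandas as pd

df = pd.DataFrame({'x': [1.0, 2.0, 3.0]})
df_ref = weakref.ref(df)  # what regress() stores on the result
assert df_ref() is df     # dereferencing works while df is alive
del df                    # caller drops the last strong reference
assert df_ref() is None   # on CPython the referent is collected immediately;
                          # this is the case in which brant() raises
```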
@@ -948,107 +1070,3 @@ class BrantResult:
 	
 		# FIXME
 		return 'FIXME'
-
-def brant(
-	df, dep, formula, *,
-	nan_policy='warn',
-	fit_kwargs=None,
-	method=None, maxiter=None, start_params=None, # common fit_kwargs
-):
-	# TODO: Documentation
-	
-	if fit_kwargs is None:
-		fit_kwargs = {}
-	if method is not None:
-		fit_kwargs['method'] = method
-	if maxiter is not None:
-		fit_kwargs['maxiter'] = maxiter
-	if start_params is not None:
-		fit_kwargs['start_params'] = start_params
-	
-	# Check for/clean NaNs
-	# Following this, we pass nan_policy='raise' to assert no NaNs remaining
-	df = df[[dep] + cols_for_formula(formula, df)]
-	df = check_nan(df, nan_policy)
-	
-	# Ensure numeric type for dependent variable
-	if df[dep].dtype != 'float64':
-		df[dep] = df[dep].astype('float64')
-	
-	# Convert pandas nullable types for independent variables as this breaks statsmodels
-	df = convert_pandas_nullable(df)
-	
-	# Precompute design matrix for RHS
-	# This is also X+ in Brant paper
-	dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe')
-	dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
-	
-	# Fit individual logistic regressions
-	logit_models = []
-	for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order
-		dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True)
-		logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs)
-		
-		if not logit_result.mle_retvals['converged']:
-			raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit))
-		
-		if pd.isna(logit_result.bse).any():
-			raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit))
-		
-		logit_models.append(logit_result)
-	
-	logit_betas = np.array([model._results.params for model in logit_models]).T
-	logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities
-	
-	# vcov is the variance-covariance matrix of all individually fitted betas across all terms
-	
-	#         | model 1 | model 2 | model 3 | ...
-	#         | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ...
-	# model 1 | term 1 |
-	#         | term 2 |
-	# model 2 | term 1 |
-	#         | term 2 |
-	# ...
-	
-	n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept)
-	n_betas = len(logit_models) * n_terms
-	vcov = np.zeros((n_betas, n_betas))
-	
-	# Populate the variance-covariance matrix for comparisons between individually fitted models
-	for j in range(0, len(logit_models) - 1):
-		for l in range(j + 1, len(logit_models)):
-			Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j])
-			Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l])
-			Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l])
-			
-			matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right)
-			j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l
-			
-			vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov
-			vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov
-	
-	# Populate the variance-covariance matrix within each individually fitted model
-	for i in range(len(logit_models)):
-		vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:]
-	
-	# ------------------
-	# Perform Wald tests
-	
-	beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]]
-	wald_results = {}
-	
-	# Omnibus test
-	constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]]
-	constraint = ', '.join(constraints)
-	df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept)
-	wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
-	wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
-	
-	# Individual terms
-	for raw_name in dmatrix_right.columns[1:]:
-		constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models)))
-		df = len(logit_models) - 1 # df = (number of levels minus 2)
-		wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df)
-		wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue)
-	
-	return BrantResult(wald_results)
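Note for reviewers: each entry in `wald_results` reduces to a standard Wald chi-squared test of equality constraints on the stacked betas. A toy recomputation in plain numpy (my own notation, not `_wald_test` internals), for one term shared by two binary logits, i.e. a three-level outcome:

```python
import numpy as np
from scipy import stats

b = np.array([0.9, 1.1])         # the same term's beta from two binary logits
V = np.array([[0.040, 0.012],
              [0.012, 0.050]])   # their joint covariance, as assembled in vcov
D = np.array([[1.0, -1.0]])      # constraint matrix encoding beta_1 = beta_2
w = (D @ b) @ np.linalg.inv(D @ V @ D.T) @ (D @ b)  # Wald statistic
p = stats.chi2.sf(w, df=1)       # df = number of levels minus 2, here 1
print(round(float(w), 3), round(float(p), 3))  # 0.606 0.436
```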