diff --git a/yli/__init__.py b/yli/__init__.py index fff4bac..a3c0ce3 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -19,7 +19,7 @@ from .config import config from .descriptives import auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, regress_bootstrap, vif +from .regress import OrdinalLogit, PenalisedLogit, brant, logit_then_regress, regress, regress_bootstrap, vif from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, ttest_ind def reload_me(): diff --git a/yli/regress.py b/yli/regress.py index 7153f61..4d7edf1 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -31,7 +31,7 @@ import warnings from .bayes_factors import BayesFactor, bayesfactor_afbf from .config import config -from .sig_tests import FTestResult +from .sig_tests import ChiSquaredResult, FTestResult from .utils import Estimate, PValueStyle, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term def vif(df, formula=None, *, nan_policy='warn'): @@ -94,7 +94,7 @@ def vif(df, formula=None, *, nan_policy='warn'): # ---------- # Regression -class LikelihoodRatioTestResult: +class LikelihoodRatioTestResult(ChiSquaredResult): """ Result of a likelihood ratio test for regression @@ -102,17 +102,7 @@ class LikelihoodRatioTestResult: """ def __init__(self, statistic, dof, pvalue): - #: Likelihood ratio test statistic (*float*) - self.statistic = statistic - #: Degrees of freedom for the likelihood ratio test statistic (*int*) - self.dof = dof - #: *p* value for the likelihood ratio test (*float*) - self.pvalue = pvalue - - def __repr__(self): - if config.repr_is_summary: - return self.summary() - return super().__repr__() + super().__init__(statistic, dof, pvalue) def _repr_html_(self): return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) @@ -913,3 +903,152 @@ class OrdinalLogit(statsmodels.miscmodels.ordinal_model.OrderedModel): def transform_reverse_threshold_params(self, params): return params[:-1] + +class _Dummy: pass + +def _wald_test(param_names, params, formula, vcov, df): + # Hack! Piggyback off statsmodels to compute a Wald test + + lmr = statsmodels.base.model.LikelihoodModelResults(model=None, params=None) + lmr.model = _Dummy() + lmr.model.data = _Dummy() + lmr.model.data.cov_names = param_names + lmr.params = params + lmr.df_resid = df + + return lmr.wald_test(formula, cov_p=vcov, use_f=False, scalar=True) + +class BrantResult: + # TODO: Documentation + + def __init__(self, tests): + #: Results for Brant test on each coefficient (*Dict[str, ChiSquaredResult]*) + self.tests = tests + + def __repr__(self): + if config.repr_is_summary: + return self.summary() + return super().__repr__() + + def _repr_html_(self): + out = '' + + for raw_name, test in self.tests.items(): + out += ''.format(raw_name, test.statistic, test.dof, fmt_p(test.pvalue, PValueStyle.TABULAR | PValueStyle.HTML)) + + out += '
Brant Test Results
χ2dfp
{}{:.2f}{:.0f}{}
' + return out + + def summary(self): + """ + Return a stringified summary of the *χ*:sup:`2` test + + :rtype: str + """ + + # FIXME + return 'FIXME' + +def brant( + df, dep, formula, *, + nan_policy='warn', + fit_kwargs=None, + method=None, maxiter=None, start_params=None, # common fit_kwargs +): + # TODO: Documentation + + if fit_kwargs is None: + fit_kwargs = {} + if method is not None: + fit_kwargs['method'] = method + if maxiter is not None: + fit_kwargs['maxiter'] = maxiter + if start_params is not None: + fit_kwargs['start_params'] = start_params + + # Check for/clean NaNs + # Following this, we pass nan_policy='raise' to assert no NaNs remaining + df = df[[dep] + cols_for_formula(formula, df)] + df = check_nan(df, nan_policy) + + # Ensure numeric type for dependent variable + if df[dep].dtype != 'float64': + df[dep] = df[dep].astype('float64') + + # Convert pandas nullable types for independent variables as this breaks statsmodels + df = convert_pandas_nullable(df) + + # Precompute design matrix for RHS + # This is also X+ in Brant paper + dmatrix_right = patsy.dmatrix(formula, df, return_type='dataframe') + dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication + + # Fit individual logistic regressions + logit_models = [] + for upper_limit in sorted(df[dep].unique())[:-1]: # FIXME: Sort order + dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) + logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **fit_kwargs) + + if not logit_result.mle_retvals['converged']: + raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) + + if pd.isna(logit_result.bse).any(): + raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit)) + + logit_models.append(logit_result) + + logit_betas = np.array([model._results.params for model in logit_models]).T + logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities + + # vcov is the variance-covariance matrix of all individually fitted betas across all terms + + # | model 1 | model 2 | model 3 | ... + # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ... + # model 1 | term 1 | + # | term 2 | + # model 2 | term 1 | + # | term 2 | + # ... + + n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept) + n_betas = len(logit_models) * n_terms + vcov = np.zeros((n_betas, n_betas)) + + # Populate the variance-covariance matrix for comparisons between individually fitted models + for j in range(0, len(logit_models) - 1): + for l in range(j + 1, len(logit_models)): + Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j]) + Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l]) + Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l]) + + matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right) + j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l + + vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov + vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov + + # Populate the variance-covariance matrix within each individually fitted model + for i in range(len(logit_models)): + vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:] + + # ------------------ + # Perform Wald tests + + beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]] + wald_results = {} + + # Omnibus test + constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]] + constraint = ', '.join(constraints) + df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept) + wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) + wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) + + # Individual terms + for raw_name in dmatrix_right.columns[1:]: + constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) + df = len(logit_models) - 1 # df = (number of levels minus 2) + wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) + wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) + + return BrantResult(wald_results) diff --git a/yli/sig_tests.py b/yli/sig_tests.py index 364f63e..14ba7a8 100644 --- a/yli/sig_tests.py +++ b/yli/sig_tests.py @@ -483,7 +483,35 @@ def mannwhitney(df, dep, ind, *, nan_policy='warn', brunnermunzel=True, use_cont # ------------------------ # Pearson chi-squared test -class PearsonChiSquaredResult: +class ChiSquaredResult: + # TODO: Documentation + + def __init__(self, statistic, dof, pvalue): + #: *χ*:sup:`2` statistic (*float*) + self.statistic = statistic + #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) + self.dof = dof + #: *p* value for the *χ*:sup:`2` test (*float*) + self.pvalue = pvalue + + def __repr__(self): + if config.repr_is_summary: + return self.summary() + return super().__repr__() + + def _repr_html_(self): + return 'χ2({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) + + def summary(self): + """ + Return a stringified summary of the *χ*:sup:`2` test + + :rtype: str + """ + + return 'χ²({}) = {:.2f}; p {}'.format(self.ct, self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) + +class PearsonChiSquaredResult(ChiSquaredResult): """ Result of a Pearson *χ*:sup:`2` test @@ -491,24 +519,15 @@ class PearsonChiSquaredResult: """ def __init__(self, ct, statistic, dof, pvalue, oddsratio=None, riskratio=None): + super().__init__(statistic, dof, pvalue) + #: Contingency table for the observations (*DataFrame*) self.ct = ct - #: *χ*:sup:`2` statistic (*float*) - self.statistic = statistic - #: Degrees of freedom for the *χ*:sup:`2` distribution (*int*) - self.dof = dof - #: *p* value for the *χ*:sup:`2` test (*float*) - self.pvalue = pvalue #: Odds ratio (*float*; *None* if not a 2×2 table) self.oddsratio = oddsratio #: Risk ratio (*float*; *None* if not a 2×2 table) self.riskratio = riskratio - def __repr__(self): - if config.repr_is_summary: - return self.summary() - return super().__repr__() - def _repr_html_(self): if self.oddsratio is not None: return '{0}
χ2({1}) = {2:.2f}; p {3}
OR ({4:g}% CI) = {5}
RR ({4:g}% CI) = {6}'.format(