# scipy-yli: Helpful SciPy utilities and recipes # Copyright © 2022–2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import numpy as np import pandas as pd import patsy from scipy import stats from scipy.special import expit import statsmodels, statsmodels.miscmodels.ordinal_model import statsmodels.api as sm from statsmodels.iolib.table import SimpleTable from statsmodels.stats.outliers_influence import variance_inflation_factor from tqdm import tqdm from datetime import datetime import itertools import warnings import weakref from .bayes_factors import BayesFactor, bayesfactor_afbf from .config import config from .shap import ShapResult from .sig_tests import ChiSquaredResult, FTestResult from .utils import Estimate, PValueStyle, as_numeric, check_nan, cols_for_formula, convert_pandas_nullable, fmt_p, formula_factor_ref_category, parse_patsy_term def vif(df, formula=None, *, nan_policy='warn'): """ Calculate the variance inflation factor (VIF) for each variable in *df* :param df: Data to calculate the VIF for :type df: DataFrame :param formula: If specified, calculate the VIF only for the variables in the formula :type formula: str :return: The variance inflation factors :rtype: Series **Example:** .. code-block:: df = pd.DataFrame({ 'D': [68.58, 67.33, 67.33, ...], 'T1': [14, 10, 10, ...], 'T2': [46, 73, 85, ...], ... }) yli.vif(df[['D', 'T1', 'T2', ...]]) .. code-block:: text D 8.318301 T1 6.081590 T2 2.457122 ... dtype: float64 The output shows the variance inflation factor for each variable in *df*. """ if formula: # Only consider columns in the formula df = df[cols_for_formula(formula, df)] # Check for/clean NaNs df = check_nan(df, nan_policy) # Convert all to float64 otherwise statsmodels chokes with "ufunc 'isfinite' not supported for the input types ..." df = pd.get_dummies(df, drop_first=True) # Convert categorical dtypes df = df.astype('float64') # Convert all other dtypes # Add intercept column orig_columns = list(df.columns) df['Intercept'] = [1] * len(df) vifs = {} for i, col in enumerate(orig_columns): vifs[col] = variance_inflation_factor(df, i) return pd.Series(vifs) # ---------- # Regression class LikelihoodRatioTestResult(ChiSquaredResult): """ Result of a likelihood ratio test for regression See :meth:`RegressionResult.lrtest_null`. """ def __init__(self, statistic, dof, pvalue): super().__init__(statistic, dof, pvalue) def _repr_html_(self): return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION | PValueStyle.HTML)) def summary(self): """ Return a stringified summary of the likelihood ratio test :rtype: str """ return 'LR({}) = {:.2f}; p {}'.format(self.dof, self.statistic, fmt_p(self.pvalue, PValueStyle.RELATION)) class RegressionResult: """ Result of a regression See :func:`yli.regress`. """ def __init__(self, model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs, raw_result, full_name, model_name, fit_method, nobs, nevents, dof_model, fitted_dt, cov_type, terms, ll_model, ll_null, dof_resid, rsquared, f_statistic, comments, exp ): # Data about how model fitted #: See :func:`yli.regress` self.model_class = model_class #: Data fitted (*weakref* to *DataFrame*) self.df = df #: See :func:`yli.regress` self.dep = dep #: See :func:`yli.regress` self.formula = formula #: See :func:`yli.regress` self.nan_policy = nan_policy #: See :func:`yli.regress` self.model_kwargs = model_kwargs #: See :func:`yli.regress` self.fit_kwargs = fit_kwargs #: Raw result from statsmodels *model.fit* self.raw_result = raw_result # Information for display #: Full name of the regression model type (*str*) self.full_name = full_name #: Short name of the regression model type (*str*) self.model_name = model_name #: Method for fitting the regression model (*str*) self.fit_method = fit_method # Basic fitted model information #: Number of observations (*int*) self.nobs = nobs #: Number of events (*int*, time-to-event models only) self.nevents = nevents #: Degrees of freedom for the model (*int*) self.dof_model = dof_model #: Date and time of fitting the model (Python *datetime*) self.fitted_dt = fitted_dt #: Method for computing the covariance matrix (*str*) self.cov_type = cov_type # Regression coefficients/p values #: Coefficients and *p* values for each term in the model (*dict* of :class:`SingleTerm` or :class:`CategoricalTerm`) self.terms = terms # Model log-likelihood #: Log-likelihood of fitted model (*float*) self.ll_model = ll_model #: Log-likelihood of null model (*float*) self.ll_null = ll_null # Extra statistics (not all regression models have these) #: Degrees of freedom for the residuals (*int*; *None* if N/A) self.dof_resid = dof_resid #: *R*:sup:`2` statistic (*float*; *None* if N/A) self.rsquared = rsquared #: *F* statistic (*float*; *None* if N/A) self.f_statistic = f_statistic #: Comments for the model (*List[str]*) self.comments = comments or [] # Config for display style #: See :func:`yli.regress` self.exp = exp @property def pseudo_rsquared(self): """McFadden's pseudo *R*:sup:`2` statistic""" return 1 - self.ll_model/self.ll_null def lrtest_null(self): """ Compute the likelihood ratio test comparing the model to the null model :rtype: :class:`LikelihoodRatioTestResult` """ statistic = -2 * (self.ll_null - self.ll_model) pvalue = 1 - stats.chi2.cdf(statistic, self.dof_model) return LikelihoodRatioTestResult(statistic, self.dof_model, pvalue) def ftest(self): """ Perform the *F* test that all slopes are 0 :rtype: :class:`yli.sig_tests.FTestResult` """ pvalue = 1 - stats.f(self.dof_model, self.dof_resid).cdf(self.f_statistic) return FTestResult(self.f_statistic, self.dof_model, self.dof_resid, pvalue) def bayesfactor_beta_zero(self, term): """ Compute a Bayes factor testing the hypothesis that the given beta is 0 Uses the R *BFpack* library. Requires the regression to be from statsmodels. The term must be specified as the *raw name* from the statsmodels regression, available via :attr:`RegressionResult.raw_result`. :param term: Raw name of the term to be tested :type term: str :rtype: :class:`yli.bayes_factors.BayesFactor` """ # FIXME: Allow specifying our renamed terms # Get parameters required for AFBF params = pd.Series({raw_name.replace('[', '_').replace(']', '_'): beta for raw_name, beta in self.raw_result.params.items()}) cov = self.raw_result.cov_params() # Compute BF matrix bf01 = bayesfactor_afbf(params, cov, self.nobs, '{} = 0'.format(term.replace('[', '_').replace(']', '_'))) bf01 = BayesFactor(bf01.factor, '0', '{} = 0'.format(term), '1', '{} ≠ 0'.format(term)) if bf01.factor >= 1: return bf01 else: return bf01.invert() def brant(self): """ Perform the Brant test for the parallel regression assumption in ordinal regression Applicable when the model was fitted using :class:`OrdinalLogit`. :rtype: :class:`BrantResult` **Example:** .. code-block:: df = pd.DataFrame(...) model = yli.regress(yli.OrdinalLogit, df, 'apply', 'pared + public + gpa', exp=False) model.brant() .. code-block:: text χ² df p Omnibus 4.34 3 0.23 pared 0.13 1 0.72 public 3.44 1 0.06 gpa 0.18 1 0.67 The output shows the result of the Brant test. For example, for the omnibus test of the parallel regression assumption across all independent variables, the *χ*:sup:`2` statistic is 4.34, the *χ*:sup:`2` distribution has 3 degrees of freedom, and the test is not significant, with *p* value 0.23. **Reference:** Brant R. Assessing proportionality in the proportional odds model for ordinal logistic regression. *Biometrics*. 1990;46(4):1171–8. `doi:10.2307/2532457 `_ """ df = self.df() if df is None: raise Exception('Referenced DataFrame has been dropped') dep = self.dep # Check for/clean NaNs # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' # Following this, we pass nan_policy='raise' to assert no NaNs remaining df = df[[dep] + cols_for_formula(self.formula, df)] df = check_nan(df, 'omit') # Ensure numeric type for dependent variable df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) # Precompute design matrix for RHS # This is also X+ in Brant paper dmatrix_right = patsy.dmatrix(self.formula, df, return_type='dataframe') dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication # Fit individual logistic regressions logit_models = [] for upper_limit in sorted(df[dep].unique())[:-1]: dep_dichotomous = (df[dep] <= upper_limit).astype(int).reset_index(drop=True) logit_result = sm.Logit(dep_dichotomous, dmatrix_right).fit(disp=False, **self.fit_kwargs) if not logit_result.mle_retvals['converged']: raise Exception('Maximum likelihood estimation failed to converge for {} <= {}. Check raw_result.mle_retvals.'.format(dep, upper_limit)) if pd.isna(logit_result.bse).any(): raise Exception('Regression returned NaN standard errors for {} <= {}.'.format(dep, upper_limit)) logit_models.append(logit_result) logit_betas = np.array([model._results.params for model in logit_models]).T logit_pihat = np.array([expit(-model.fittedvalues) for model in logit_models]).T # Predicted probabilities # vcov is the variance-covariance matrix of all individually fitted betas across all terms # | model 1 | model 2 | model 3 | ... # | term 1 | term 2 | term 1 | term 2 | term 1 | term 2 | ... # model 1 | term 1 | # | term 2 | # model 2 | term 1 | # | term 2 | # ... n_terms = len(dmatrix_right.columns) - 1 # number of beta terms (excluding intercept) n_betas = len(logit_models) * n_terms vcov = np.zeros((n_betas, n_betas)) # Populate the variance-covariance matrix for comparisons between individually fitted models for j in range(0, len(logit_models) - 1): for l in range(j + 1, len(logit_models)): Wjj = np.diag(logit_pihat[:,j] - logit_pihat[:,j] * logit_pihat[:,j]) Wjl = np.diag(logit_pihat[:,l] - logit_pihat[:,j] * logit_pihat[:,l]) Wll = np.diag(logit_pihat[:,l] - logit_pihat[:,l] * logit_pihat[:,l]) matrix_result = np.linalg.inv(dmatrix_right.T @ Wjj @ dmatrix_right) @ dmatrix_right.T @ Wjl @ dmatrix_right @ np.linalg.inv(dmatrix_right.T @ Wll @ dmatrix_right) j_vs_l_vcov = np.asarray(matrix_result)[1:,1:] # Asymptotic covariance for j,l vcov[j*n_terms:(j+1)*n_terms, l*n_terms:(l+1)*n_terms] = j_vs_l_vcov vcov[l*n_terms:(l+1)*n_terms, j*n_terms:(j+1)*n_terms] = j_vs_l_vcov # Populate the variance-covariance matrix within each individually fitted model for i in range(len(logit_models)): vcov[i*n_terms:(i+1)*n_terms, i*n_terms:(i+1)*n_terms] = logit_models[i]._results.cov_params()[1:,1:] # ------------------ # Perform Wald tests beta_names = ['{}_{}'.format(raw_name, i) for i in range(len(logit_models)) for raw_name in dmatrix_right.columns[1:]] wald_results = {} # Omnibus test constraints = [' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) for raw_name in dmatrix_right.columns[1:]] constraint = ', '.join(constraints) df = (len(logit_models) - 1) * (len(dmatrix_right.columns) - 1) # df = (number of levels minus 2) * (number of terms excluding intercept) wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) wald_results['Omnibus'] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) # Individual terms for raw_name in dmatrix_right.columns[1:]: constraint = ' = '.join('{}_{}'.format(raw_name, i) for i in range(len(logit_models))) df = len(logit_models) - 1 # df = (number of levels minus 2) wald_result = _wald_test(beta_names, logit_betas[1:].ravel('F'), constraint, vcov, df) wald_results[raw_name] = ChiSquaredResult(wald_result.statistic, wald_result.df_denom, wald_result.pvalue) return BrantResult(wald_results) def bootstrap(self, samples=1000): """ Use bootstrapping to recompute confidence intervals and *p* values for the terms in the regression model :param samples: Number of bootstrap samples to draw :type samples: int :rtype: :class:`yli.regress.RegressionResult` """ df = self.df() if df is None: raise Exception('Referenced DataFrame has been dropped') dep = self.dep # Check for/clean NaNs # NaN warning/error will already have been handled in regress, so here we pass nan_policy='omit' # Following this, we pass nan_policy='raise' to assert no NaNs remaining df = df[[dep] + cols_for_formula(self.formula, df)] df = check_nan(df, 'omit') # Ensure numeric type for dependent variable df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) # Precompute design matrices dmatrices = patsy.dmatrices(dep + ' ~ ' + self.formula, df, return_type='dataframe') # Fit full model #full_model = regress(self.model_class, df, dep, self.formula, nan_policy='raise', _dmatrices=dmatrices, model_kwargs=self.model_kwargs, fit_kwargs=self.fit_kwargs) # Initialise bootstrap_results bootstrap_results = {} # Dict mapping term raw names to bootstrap betas for term in self.terms.values(): if isinstance(term, SingleTerm): bootstrap_results[term.raw_name] = [] else: for sub_term in term.categories.values(): bootstrap_results[sub_term.raw_name] = [] # Draw bootstrap samples and regress dmatrices = dmatrices[0].join(dmatrices[1]) for i in tqdm(range(samples)): bootstrap_rows = dmatrices.sample(len(df), replace=True) model = self.model_class(endog=bootstrap_rows.iloc[:,0], exog=bootstrap_rows.iloc[:,1:], **self.model_kwargs) model.formula = dep + ' ~ ' + self.formula result = model.fit(disp=False, **self.fit_kwargs) for raw_name, raw_beta in zip(model.exog_names, result._results.params): bootstrap_results[raw_name].append(raw_beta) # Combine bootstrap results terms = {} for term_name, term in self.terms.items(): if isinstance(term, SingleTerm): bootstrap_betas = bootstrap_results[term.raw_name] bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) terms[term_name] = SingleTerm(term.raw_name, Estimate(term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) else: categories = {} for sub_term_name, sub_term in term.categories.items(): bootstrap_betas = bootstrap_results[sub_term.raw_name] bootstrap_pvalue = sum(1 for b in bootstrap_betas if b < 0) / len(bootstrap_betas) bootstrap_pvalue = 2 * min(bootstrap_pvalue, 1 - bootstrap_pvalue) categories[sub_term_name] = SingleTerm(sub_term.raw_name, Estimate(sub_term.beta.point, np.quantile(bootstrap_betas, config.alpha/2), np.quantile(bootstrap_betas, 1-config.alpha/2)), bootstrap_pvalue) terms[term_name] = CategoricalTerm(categories, term.ref_category) return RegressionResult( self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs, None, self.full_name, self.model_name, self.fit_method, self.nobs, None, self.dof_model, datetime.now(), 'Bootstrap', terms, self.ll_model, self.ll_null, self.dof_resid, self.rsquared, self.f_statistic, self.comments, self.exp ) def shap(self, **kwargs): """ Compute SHAP values for the model Uses the Python *shap* library. :param kwargs: Keyword arguments to pass to *shap.LinearExplainer* :rtype: :class:`yli.shap.ShapResult` **Reference:** Lundberg SM, Lee SI. A unified approach to interpreting model predictions. In: Guyon I, Von Luxburg U, Bengio S, et al., editors. *Advances in Neural Information Processing Systems*; 2017 Dec 4–9; Long Beach, CA. https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html """ import shap xdata = ShapResult._get_xdata(self) # Combine terms into single list params = [] for term in self.terms.values(): if isinstance(term, SingleTerm): params.append(term.beta.point) else: params.extend(s.beta.point for s in term.categories.values()) explainer = shap.LinearExplainer((np.array(params[1:]), params[0]), xdata, **kwargs) # FIXME: Assumes zeroth term is intercept shap_values = explainer.shap_values(xdata).astype('float') return ShapResult(weakref.ref(self), shap_values, list(xdata.columns)) def _header_table(self, html): """Return the entries for the header table""" # Left column left_col = [] left_col.append(('Dep. Variable:', self.dep)) left_col.append(('Model:', self.model_name)) left_col.append(('Method:', self.fit_method)) left_col.append(('Date:', self.fitted_dt.strftime('%Y-%m-%d'))) left_col.append(('Time:', self.fitted_dt.strftime('%H:%M:%S'))) if self.cov_type: left_col.append(('Std. Errors:', 'Non-Robust' if self.cov_type == 'nonrobust' else self.cov_type.upper() if self.cov_type.startswith('hc') else self.cov_type)) # Right column right_col = [] right_col.append(('No. Observations:', format(self.nobs, '.0f'))) if self.nevents: right_col.append(('No. Events:', format(self.nevents, '.0f'))) right_col.append(('Df. Model:', format(self.dof_model, '.0f'))) if self.dof_resid: right_col.append(('Df. Residuals:', format(self.dof_resid, '.0f'))) if self.rsquared: right_col.append(('R²:' if html else 'R²:', format(self.rsquared, '.2f'))) elif self.ll_null: right_col.append(('Pseudo R²:' if html else 'Pseudo R²:', format(self.pseudo_rsquared, '.2f'))) if self.f_statistic: # Report the F test if available f_result = self.ftest() if html: right_col.append(('F:', format(f_result.statistic, '.2f'))) right_col.append(('p (F):', fmt_p(f_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML))) else: right_col.append(('F:', format(f_result.statistic, '.2f'))) right_col.append(('p (F):', fmt_p(f_result.pvalue, PValueStyle.VALUE_ONLY))) else: # Otherwise report likelihood ratio test as overall test right_col.append(('LL-Model:', format(self.ll_model, '.2f'))) if self.ll_null: lrtest_result = self.lrtest_null() right_col.append(('LL-Null:', format(self.ll_null, '.2f'))) if html: right_col.append(('p (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY | PValueStyle.HTML))) else: right_col.append(('p (LR):', fmt_p(lrtest_result.pvalue, PValueStyle.VALUE_ONLY))) return left_col, right_col def __repr__(self): if config.repr_is_summary: return self.summary() return super().__repr__() def _repr_html_(self): # Render header table left_col, right_col = self._header_table(html=True) out = ''.format(self.full_name) for left_cell, right_cell in itertools.zip_longest(left_col, right_col): out += ''.format( left_cell[0] if left_cell else '', left_cell[1] if left_cell else '', right_cell[0] if right_cell else '', right_cell[1] if right_cell else '' ) out += '

{} Results
{}	{}	{}	{}

' # Render coefficients table out += ''.format('exp(β)' if self.exp else 'β', (1-config.alpha)*100) for term_name, term in self.terms.items(): if isinstance(term, SingleTerm): # Single term # Exponentiate beta if requested beta = term.beta if self.exp: beta = np.exp(beta) out += ''.format(term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(term.pvalue, PValueStyle.TABULAR | PValueStyle.HTML)) elif isinstance(term, CategoricalTerm): # Categorical term out += ''.format(term_name) # Render reference category if term.ref_category is not None: out += ''.format(term.ref_category) # Loop over terms for sub_term_name, sub_term in term.categories.items(): # Exponentiate beta if requested beta = sub_term.beta if self.exp: beta = np.exp(beta) out += ''.format(sub_term_name, beta.point, beta.ci_lower, beta.ci_upper, fmt_p(sub_term.pvalue, PValueStyle.TABULAR | PValueStyle.HTML)) else: raise Exception('Attempt to render unknown term type') out += '

	{}	({:g}% CI)			p
{}	{:.2f}	({:.2f}	–	{:.2f})	{}
{}
{}	Ref.
{}	{:.2f}	({:.2f}	–	{:.2f})	{}

' # TODO: Have a detailed view which shows SE, t/z, etc. if self.comments: out += '

{}

' return out def summary(self): """ Return a stringified summary of the regression model :rtype: str """ # Render header table left_col, right_col = self._header_table(html=False) # Ensure equal sizes for SimpleTable if len(right_col) > len(left_col): left_col.extend([('', '')] * (len(right_col) - len(left_col))) elif len(left_col) > len(right_col): right_col.extend([('', '')] * (len(left_col) - len(right_col))) table1 = SimpleTable(np.concatenate([left_col, right_col], axis=1), title='{} Results'.format(self.full_name)) table1.insert_stubs(2, [' | '] * len(left_col)) # Get rid of last line (merge with next table) table1_lines = table1.as_text().splitlines(keepends=False) out = '\n'.join(table1_lines[:-1]) + '\n' # Render coefficients table table_data = [] for term_name, term in self.terms.items(): if isinstance(term, SingleTerm): # Single term # Exponentiate beta if requested beta = term.beta if self.exp: beta = np.exp(beta) # Add some extra padding table_data.append([term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(term.pvalue, PValueStyle.TABULAR)]) elif isinstance(term, CategoricalTerm): # Categorical term table_data.append([term_name + ' ', '', '', '', '', '']) # Render reference category if term.ref_category is not None: table_data.append(['{} '.format(term.ref_category), 'Ref.', '', '', '', '']) # Loop over terms for sub_term_name, sub_term in term.categories.items(): # Exponentiate beta if requested beta = sub_term.beta if self.exp: beta = np.exp(beta) table_data.append([sub_term_name + ' ', format(beta.point, '.2f'), '({:.2f}'.format(beta.ci_lower), '-', '{:.2f})'.format(beta.ci_upper), ' ' + fmt_p(sub_term.pvalue, PValueStyle.TABULAR)]) else: raise Exception('Attempt to render unknown term type') table2 = SimpleTable(data=table_data, headers=['', 'exp(β)' if self.exp else 'β', '', '\ue000', '', ' p']) # U+E000 is in Private Use Area, mark middle of CI column table2_text = table2.as_text().replace(' \ue000 ', '({:g}% CI)'.format((1-config.alpha)*100)) # Render heading in the right spot table2_lines = table2_text.splitlines(keepends=False) # Render divider line between 2 tables max_table_len = max(len(table1_lines[-1]), len(table2_lines[-1])) out += '=' * max_table_len + '\n' out += '\n'.join(table2_lines[1:]) if self.comments: out += '\n' for i, comment in enumerate(self.comments): out += '\n{}. {}'.format(i + 1, comment) return out class SingleTerm: """A term in a :class:`RegressionResult` which is a single term""" def __init__(self, raw_name, beta, pvalue): #: Raw name of the term (*str*; e.g. in :attr:`RegressionResult.raw_result`) self.raw_name = raw_name #: :class:`yli.utils.Estimate` of the coefficient self.beta = beta #: *p* value for the coefficient (*float*) self.pvalue = pvalue class CategoricalTerm: """A group of terms in a :class:`RegressionResult` corresponding to a categorical variable""" def __init__(self, categories, ref_category): #: Terms for each of the categories, excluding the reference category (*dict* of :class:`SingleTerm`) self.categories = categories #: Name of the reference category (*str*) self.ref_category = ref_category def regress( model_class, df, dep, formula, *, nan_policy='warn', model_kwargs=None, fit_kwargs=None, family=None, exposure=None, status=None, # common model_kwargs cov_type=None, method=None, maxiter=None, start_params=None, # common fit_kwargs bool_baselevels=False, exp=None, _dmatrices=None, ): """ Fit a statsmodels regression model :param model_class: Type of regression model to fit :type model_class: statsmodels model class :param df: Data to perform regression on :type df: DataFrame :param dep: Column in *df* for the dependent variable (numeric) :type dep: str :param formula: Patsy formula for the regression model :type formula: str :param exposure: Column in *df* for the exposure variable (numeric, some models only) :type exposure: str :param status: Column in *df* for the status variable (True/False or 1/0, time-to-event models only) :type status: str :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`) :type nan_policy: str :param model_kwargs: Keyword arguments to pass to *model_class* constructor :type model_kwargs: dict :param fit_kwargs: Keyword arguments to pass to *model.fit* :type fit_kwargs: dict :param family: See statsmodels *GLM* constructor :param cov_type: See statsmodels *model.fit* :param method: See statsmodels *model.fit* :param maxiter: See statsmodels *model.fit* :param start_params: See statsmodels *model.fit* :param bool_baselevels: Show reference categories for boolean independent variables even if reference category is *False* :type bool_baselevels: bool :param exp: Report exponentiated parameters rather than raw parameters, default (*None*) is to autodetect based on *model_class* :type exp: bool :rtype: :class:`yli.regress.RegressionResult` **Example:** .. code-block:: df = pd.DataFrame({ 'Unhealthy': [False, False, False, ...], 'Fibrinogen': [2.52, 2.46, 2.29, ...], 'GammaGlobulin': [38, 36, 36, ...] }) yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') .. code-block:: text Logistic Regression Results ====================================================== Dep. Variable: Unhealthy | No. Observations: 32 Model: Logit | Df. Model: 2 Method: MLE | Df. Residuals: 29 Date: 2022-10-18 | Pseudo R²: 0.26 Time: 19:00:34 | LL-Model: -11.47 Std. Errors: Non-Robust | LL-Null: -15.44 | p (LR): 0.02* ====================================================== exp(β) (95% CI) p ----------------------------------------------- (Intercept) 0.00 (0.00 - 0.24) 0.03* Fibrinogen 6.80 (1.01 - 45.79) 0.049* GammaGlobulin 1.17 (0.92 - 1.48) 0.19 ----------------------------------------------- The output summarises the results of the regression. Note that the parameter estimates are automatically exponentiated. For example, the odds ratio for unhealthiness per unit increase in fibrinogen is 6.80, with 95% confidence interval 1.01–45.79, and is significant with *p* value 0.049. """ # Populate model_kwargs if model_kwargs is None: model_kwargs = {} if family is not None: model_kwargs['family'] = family # Populate fit_kwargs if fit_kwargs is None: fit_kwargs = {} if cov_type is not None: fit_kwargs['cov_type'] = cov_type if method is not None: fit_kwargs['method'] = method if maxiter is not None: fit_kwargs['maxiter'] = maxiter if start_params is not None: fit_kwargs['start_params'] = start_params # Autodetect whether to exponentiate if exp is None: if model_class in (sm.Logit, sm.PHReg, sm.Poisson, OrdinalLogit, PenalisedLogit): exp = True else: exp = False df_ref = weakref.ref(df) if _dmatrices is None: # Check for/clean NaNs in input columns columns = [dep] + cols_for_formula(formula, df) if exposure is not None: columns.append(exposure) if status is not None: columns.append(status) df = df[columns] df = check_nan(df, nan_policy) # Ensure numeric type for dependent variable df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) # Construct design matrix from formula dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe') else: dmatrices = _dmatrices if model_class in (sm.PHReg, OrdinalLogit): # Drop explicit intercept term # FIXME: Check before dropping dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:]) # Add exposure to model if exposure is not None: if df[exposure].dtype == '