Add support for statsmodels PHReg to yli.regress

This commit is contained in:
RunasSudo 2023-02-25 14:46:13 +11:00
parent fa95f6d75c
commit eb0d520d95
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
1 changed files with 51 additions and 22 deletions

View File

@ -129,7 +129,7 @@ class RegressionResult:
model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs,
raw_result,
full_name, model_name, fit_method,
nobs, dof_model, fitted_dt, cov_type,
nobs, nevents, dof_model, fitted_dt, cov_type,
terms,
ll_model, ll_null,
dof_resid, rsquared, f_statistic,
@ -166,6 +166,8 @@ class RegressionResult:
# Basic fitted model information
#: Number of observations (*int*)
self.nobs = nobs
#: Number of events (*int*, time-to-event models only)
self.nevents = nevents
#: Degrees of freedom for the model (*int*)
self.dof_model = dof_model
#: Date and time of fitting the model (Python *datetime*)
@ -453,7 +455,7 @@ class RegressionResult:
self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs,
None,
self.full_name, self.model_name, self.fit_method,
self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
self.nobs, None, self.dof_model, datetime.now(), 'Bootstrap',
terms,
self.ll_model, self.ll_null,
self.dof_resid, self.rsquared, self.f_statistic,
@ -499,12 +501,14 @@ class RegressionResult:
right_col = []
right_col.append(('No. Observations:', format(self.nobs, '.0f')))
if self.nevents:
right_col.append(('No. Events:', format(self.nevents, '.0f')))
right_col.append(('Df. Model:', format(self.dof_model, '.0f')))
if self.dof_resid:
right_col.append(('Df. Residuals:', format(self.dof_resid, '.0f')))
if self.rsquared:
right_col.append(('<i>R</i><sup>2</sup>:' if html else 'R²:', format(self.rsquared, '.2f')))
else:
elif self.ll_null:
right_col.append(('Pseudo <i>R</i><sup>2</sup>:' if html else 'Pseudo R²:', format(self.pseudo_rsquared, '.2f')))
if self.f_statistic:
# Report the F test if available
@ -516,7 +520,7 @@ class RegressionResult:
else:
right_col.append(('F:', format(f_result.statistic, '.2f')))
right_col.append(('p (F):', fmt_p(f_result.pvalue, PValueStyle.VALUE_ONLY)))
else:
elif self.ll_null:
# Otherwise report likelihood ratio test as overall test
lrtest_result = self.lrtest_null()
@ -689,7 +693,7 @@ def regress(
model_class, df, dep, formula, *,
nan_policy='warn',
model_kwargs=None, fit_kwargs=None,
family=None, exposure=None, # common model_kwargs
family=None, exposure=None, status=None, # common model_kwargs
cov_type=None, method=None, maxiter=None, start_params=None, # common fit_kwargs
bool_baselevels=False, exp=None,
_dmatrices=None,
@ -707,6 +711,8 @@ def regress(
:type formula: str
:param exposure: Column in *df* for the exposure variable (numeric, some models only)
:type exposure: str
:param status: Column in *df* for the status variable (time-to-event models only)
:type status: str
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
:type nan_policy: str
:param model_kwargs: Keyword arguments to pass to *model_class* constructor
@ -780,9 +786,7 @@ def regress(
# Autodetect whether to exponentiate
if exp is None:
if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
exp = True
elif model_class is OrdinalLogit:
if model_class in (sm.Logit, sm.PHReg, sm.Poisson, OrdinalLogit, PenalisedLogit):
exp = True
else:
exp = False
@ -790,11 +794,14 @@ def regress(
df_ref = weakref.ref(df)
if _dmatrices is None:
# Check for/clean NaNs
if exposure is None:
df = df[[dep] + cols_for_formula(formula, df)]
else:
df = df[[dep, exposure] + cols_for_formula(formula, df)]
# Check for/clean NaNs in input columns
columns = [dep] + cols_for_formula(formula, df)
if exposure is not None:
columns.append(exposure)
if status is not None:
columns.append(status)
df = df[columns]
df = check_nan(df, nan_policy)
# Ensure numeric type for dependent variable
@ -808,17 +815,22 @@ def regress(
else:
dmatrices = _dmatrices
if model_class is OrdinalLogit:
if model_class in (sm.PHReg, OrdinalLogit):
# Drop explicit intercept term
# FIXME: Check before dropping
dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:])
# Add exposure to model
if exposure is not None:
if df[exposure].dtype == '<m8[ns]':
model_kwargs['exposure'] = df[exposure].dt.total_seconds()
else:
model_kwargs['exposure'] = df[exposure]
# Add status to model
if status is not None:
model_kwargs['status'] = df[status]
# Fit model
model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs)
model.formula = dep + ' ~ ' + formula
@ -845,11 +857,19 @@ def regress(
# Join term names manually because statsmodels wrapper is very slow
#confint = result.conf_int(config.alpha)
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
if hasattr(result, '_results'):
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
params = result._results.params
else:
# e.g. PHReg
confint = {k: v for k, v in zip(model.exog_names, result.conf_int(config.alpha))}
pvalues = {k: v for k, v in zip(model.exog_names, result.pvalues)}
params = result.params
#for raw_name, raw_beta in result.params.items():
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
for raw_name, raw_beta in zip(model.exog_names, params):
beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1])
# Rename terms
@ -896,6 +916,8 @@ def regress(
ll_null = result.ll_null
elif hasattr(result, 'llnull'):
ll_null = result.llnull
elif model_class is sm.PHReg:
ll_null = None
else:
# Construct null (intercept-only) model
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
@ -908,6 +930,8 @@ def regress(
if model_class is sm.OLS:
method_name = 'Least Squares'
elif model_class is sm.PHReg:
method_name = 'MPLE'
else:
# Parse raw regression results to get fit method
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
@ -915,21 +939,26 @@ def regress(
header_dict = {x[0]: x[1] for x in header_entries}
method_name = header_dict['Method:']
# Get full name to display
# Get names to display
if model_class is sm.PHReg:
short_name = 'Cox'
else:
short_name = model_class.__name__
if model_class is sm.Logit:
full_name = 'Logistic Regression'
elif model_class is OrdinalLogit:
full_name = 'Ordinal Logistic Regression'
else:
full_name = '{} Regression'.format(model_class.__name__)
full_name = '{} Regression'.format(short_name)
if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust':
full_name = 'Robust {}'.format(full_name)
return RegressionResult(
model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs,
result,
full_name, model_class.__name__, method_name,
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
full_name, short_name, method_name,
getattr(result, 'nobs', len(df)), df[status].sum() if model_class is sm.PHReg else None, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
terms,
result.llf, ll_null,
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
@ -1048,7 +1077,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
None, None, None, None, None, None, None, # Set in regress()
model,
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
model['n'][0], None, model['df'][0], datetime.now(), 'nonrobust',
terms,
model['loglik'][0], model['loglik'][1],
None, None, None,