Add support for statsmodels PHReg to yli.regress
This commit is contained in:
parent
fa95f6d75c
commit
eb0d520d95
@ -129,7 +129,7 @@ class RegressionResult:
|
||||
model_class, df, dep, formula, nan_policy, model_kwargs, fit_kwargs,
|
||||
raw_result,
|
||||
full_name, model_name, fit_method,
|
||||
nobs, dof_model, fitted_dt, cov_type,
|
||||
nobs, nevents, dof_model, fitted_dt, cov_type,
|
||||
terms,
|
||||
ll_model, ll_null,
|
||||
dof_resid, rsquared, f_statistic,
|
||||
@ -166,6 +166,8 @@ class RegressionResult:
|
||||
# Basic fitted model information
|
||||
#: Number of observations (*int*)
|
||||
self.nobs = nobs
|
||||
#: Number of events (*int*, time-to-event models only)
|
||||
self.nevents = nevents
|
||||
#: Degrees of freedom for the model (*int*)
|
||||
self.dof_model = dof_model
|
||||
#: Date and time of fitting the model (Python *datetime*)
|
||||
@ -453,7 +455,7 @@ class RegressionResult:
|
||||
self.model_class, self.df, dep, self.formula, self.nan_policy, self.model_kwargs, self.fit_kwargs,
|
||||
None,
|
||||
self.full_name, self.model_name, self.fit_method,
|
||||
self.nobs, self.dof_model, datetime.now(), 'Bootstrap',
|
||||
self.nobs, None, self.dof_model, datetime.now(), 'Bootstrap',
|
||||
terms,
|
||||
self.ll_model, self.ll_null,
|
||||
self.dof_resid, self.rsquared, self.f_statistic,
|
||||
@ -499,12 +501,14 @@ class RegressionResult:
|
||||
right_col = []
|
||||
|
||||
right_col.append(('No. Observations:', format(self.nobs, '.0f')))
|
||||
if self.nevents:
|
||||
right_col.append(('No. Events:', format(self.nevents, '.0f')))
|
||||
right_col.append(('Df. Model:', format(self.dof_model, '.0f')))
|
||||
if self.dof_resid:
|
||||
right_col.append(('Df. Residuals:', format(self.dof_resid, '.0f')))
|
||||
if self.rsquared:
|
||||
right_col.append(('<i>R</i><sup>2</sup>:' if html else 'R²:', format(self.rsquared, '.2f')))
|
||||
else:
|
||||
elif self.ll_null:
|
||||
right_col.append(('Pseudo <i>R</i><sup>2</sup>:' if html else 'Pseudo R²:', format(self.pseudo_rsquared, '.2f')))
|
||||
if self.f_statistic:
|
||||
# Report the F test if available
|
||||
@ -516,7 +520,7 @@ class RegressionResult:
|
||||
else:
|
||||
right_col.append(('F:', format(f_result.statistic, '.2f')))
|
||||
right_col.append(('p (F):', fmt_p(f_result.pvalue, PValueStyle.VALUE_ONLY)))
|
||||
else:
|
||||
elif self.ll_null:
|
||||
# Otherwise report likelihood ratio test as overall test
|
||||
lrtest_result = self.lrtest_null()
|
||||
|
||||
@ -689,7 +693,7 @@ def regress(
|
||||
model_class, df, dep, formula, *,
|
||||
nan_policy='warn',
|
||||
model_kwargs=None, fit_kwargs=None,
|
||||
family=None, exposure=None, # common model_kwargs
|
||||
family=None, exposure=None, status=None, # common model_kwargs
|
||||
cov_type=None, method=None, maxiter=None, start_params=None, # common fit_kwargs
|
||||
bool_baselevels=False, exp=None,
|
||||
_dmatrices=None,
|
||||
@ -707,6 +711,8 @@ def regress(
|
||||
:type formula: str
|
||||
:param exposure: Column in *df* for the exposure variable (numeric, some models only)
|
||||
:type exposure: str
|
||||
:param status: Column in *df* for the status variable (time-to-event models only)
|
||||
:type status: str
|
||||
:param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
|
||||
:type nan_policy: str
|
||||
:param model_kwargs: Keyword arguments to pass to *model_class* constructor
|
||||
@ -780,9 +786,7 @@ def regress(
|
||||
|
||||
# Autodetect whether to exponentiate
|
||||
if exp is None:
|
||||
if model_class in (sm.Logit, sm.Poisson, PenalisedLogit):
|
||||
exp = True
|
||||
elif model_class is OrdinalLogit:
|
||||
if model_class in (sm.Logit, sm.PHReg, sm.Poisson, OrdinalLogit, PenalisedLogit):
|
||||
exp = True
|
||||
else:
|
||||
exp = False
|
||||
@ -790,11 +794,14 @@ def regress(
|
||||
df_ref = weakref.ref(df)
|
||||
|
||||
if _dmatrices is None:
|
||||
# Check for/clean NaNs
|
||||
if exposure is None:
|
||||
df = df[[dep] + cols_for_formula(formula, df)]
|
||||
else:
|
||||
df = df[[dep, exposure] + cols_for_formula(formula, df)]
|
||||
# Check for/clean NaNs in input columns
|
||||
columns = [dep] + cols_for_formula(formula, df)
|
||||
if exposure is not None:
|
||||
columns.append(exposure)
|
||||
if status is not None:
|
||||
columns.append(status)
|
||||
|
||||
df = df[columns]
|
||||
df = check_nan(df, nan_policy)
|
||||
|
||||
# Ensure numeric type for dependent variable
|
||||
@ -808,17 +815,22 @@ def regress(
|
||||
else:
|
||||
dmatrices = _dmatrices
|
||||
|
||||
if model_class is OrdinalLogit:
|
||||
if model_class in (sm.PHReg, OrdinalLogit):
|
||||
# Drop explicit intercept term
|
||||
# FIXME: Check before dropping
|
||||
dmatrices = (dmatrices[0], dmatrices[1].iloc[:,1:])
|
||||
|
||||
# Add exposure to model
|
||||
if exposure is not None:
|
||||
if df[exposure].dtype == '<m8[ns]':
|
||||
model_kwargs['exposure'] = df[exposure].dt.total_seconds()
|
||||
else:
|
||||
model_kwargs['exposure'] = df[exposure]
|
||||
|
||||
# Add status to model
|
||||
if status is not None:
|
||||
model_kwargs['status'] = df[status]
|
||||
|
||||
# Fit model
|
||||
model = model_class(endog=dmatrices[0], exog=dmatrices[1], **model_kwargs)
|
||||
model.formula = dep + ' ~ ' + formula
|
||||
@ -845,11 +857,19 @@ def regress(
|
||||
|
||||
# Join term names manually because statsmodels wrapper is very slow
|
||||
#confint = result.conf_int(config.alpha)
|
||||
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
|
||||
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
|
||||
|
||||
if hasattr(result, '_results'):
|
||||
confint = {k: v for k, v in zip(model.exog_names, result._results.conf_int(config.alpha))}
|
||||
pvalues = {k: v for k, v in zip(model.exog_names, result._results.pvalues)}
|
||||
params = result._results.params
|
||||
else:
|
||||
# e.g. PHReg
|
||||
confint = {k: v for k, v in zip(model.exog_names, result.conf_int(config.alpha))}
|
||||
pvalues = {k: v for k, v in zip(model.exog_names, result.pvalues)}
|
||||
params = result.params
|
||||
|
||||
#for raw_name, raw_beta in result.params.items():
|
||||
for raw_name, raw_beta in zip(model.exog_names, result._results.params):
|
||||
for raw_name, raw_beta in zip(model.exog_names, params):
|
||||
beta = Estimate(raw_beta, confint[raw_name][0], confint[raw_name][1])
|
||||
|
||||
# Rename terms
|
||||
@ -896,6 +916,8 @@ def regress(
|
||||
ll_null = result.ll_null
|
||||
elif hasattr(result, 'llnull'):
|
||||
ll_null = result.llnull
|
||||
elif model_class is sm.PHReg:
|
||||
ll_null = None
|
||||
else:
|
||||
# Construct null (intercept-only) model
|
||||
#result_null = model_class.from_formula(formula=dep + ' ~ 1', data=df).fit()
|
||||
@ -908,6 +930,8 @@ def regress(
|
||||
|
||||
if model_class is sm.OLS:
|
||||
method_name = 'Least Squares'
|
||||
elif model_class is sm.PHReg:
|
||||
method_name = 'MPLE'
|
||||
else:
|
||||
# Parse raw regression results to get fit method
|
||||
# Avoid this in general as it can be expensive to summarise all the post hoc tests, etc.
|
||||
@ -915,21 +939,26 @@ def regress(
|
||||
header_dict = {x[0]: x[1] for x in header_entries}
|
||||
method_name = header_dict['Method:']
|
||||
|
||||
# Get full name to display
|
||||
# Get names to display
|
||||
if model_class is sm.PHReg:
|
||||
short_name = 'Cox'
|
||||
else:
|
||||
short_name = model_class.__name__
|
||||
|
||||
if model_class is sm.Logit:
|
||||
full_name = 'Logistic Regression'
|
||||
elif model_class is OrdinalLogit:
|
||||
full_name = 'Ordinal Logistic Regression'
|
||||
else:
|
||||
full_name = '{} Regression'.format(model_class.__name__)
|
||||
full_name = '{} Regression'.format(short_name)
|
||||
if fit_kwargs.get('cov_type', 'nonrobust') != 'nonrobust':
|
||||
full_name = 'Robust {}'.format(full_name)
|
||||
|
||||
return RegressionResult(
|
||||
model_class, df_ref, dep, formula, nan_policy, model_kwargs, fit_kwargs,
|
||||
result,
|
||||
full_name, model_class.__name__, method_name,
|
||||
result.nobs, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
|
||||
full_name, short_name, method_name,
|
||||
getattr(result, 'nobs', len(df)), df[status].sum() if model_class is sm.PHReg else None, result.df_model, datetime.now(), getattr(result, 'cov_type', 'nonrobust'),
|
||||
terms,
|
||||
result.llf, ll_null,
|
||||
getattr(result, 'df_resid', None), getattr(result, 'rsquared', None), getattr(result, 'fvalue', None),
|
||||
@ -1048,7 +1077,7 @@ class PenalisedLogit(statsmodels.discrete.discrete_model.BinaryModel):
|
||||
None, None, None, None, None, None, None, # Set in regress()
|
||||
model,
|
||||
'Penalised Logistic Regression', 'Logit', 'Penalised ML',
|
||||
model['n'][0], model['df'][0], datetime.now(), 'nonrobust',
|
||||
model['n'][0], None, model['df'][0], datetime.now(), 'nonrobust',
|
||||
terms,
|
||||
model['loglik'][0], model['loglik'][1],
|
||||
None, None, None,
|
||||
|
Loading…
Reference in New Issue
Block a user