Implement yli.Cox
This commit is contained in:
parent
d17412ca07
commit
fd7384f810
@ -20,7 +20,7 @@ from .descriptives import auto_correlations, auto_descriptives
|
|||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .graphs import init_fonts, HorizontalEffectPlot
|
from .graphs import init_fonts, HorizontalEffectPlot
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import GLM, IntervalCensoredCox, Logit, OLS, OrdinalLogit, PenalisedLogit, Poisson, regress, vif
|
from .regress import Cox, GLM, IntervalCensoredCox, Logit, OLS, OrdinalLogit, PenalisedLogit, Poisson, regress, vif
|
||||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind, ttest_ind_multiple
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind, ttest_ind_multiple
|
||||||
from .survival import kaplanmeier, logrank, turnbull
|
from .survival import kaplanmeier, logrank, turnbull
|
||||||
from .utils import as_ordinal
|
from .utils import as_ordinal
|
||||||
|
@ -101,7 +101,7 @@ def regress(
|
|||||||
model_class, df, dep, formula,
|
model_class, df, dep, formula,
|
||||||
*,
|
*,
|
||||||
nan_policy='warn',
|
nan_policy='warn',
|
||||||
exposure=None, family=None,
|
exposure=None, family=None, status=None,
|
||||||
method=None, maxiter=None, start_params=None, tolerance=None,
|
method=None, maxiter=None, start_params=None, tolerance=None,
|
||||||
reduced=None,
|
reduced=None,
|
||||||
bool_baselevels=False, exp=None
|
bool_baselevels=False, exp=None
|
||||||
@ -123,6 +123,8 @@ def regress(
|
|||||||
:type exposure: str
|
:type exposure: str
|
||||||
:param family: See :class:`yli.GLM`
|
:param family: See :class:`yli.GLM`
|
||||||
:type family: str
|
:type family: str
|
||||||
|
:param status: See :class:`yli.Cox`
|
||||||
|
:type status: str
|
||||||
:param method: See statsmodels *model.fit*
|
:param method: See statsmodels *model.fit*
|
||||||
:param maxiter: See statsmodels *model.fit*
|
:param maxiter: See statsmodels *model.fit*
|
||||||
:param start_params: See statsmodels *model.fit*
|
:param start_params: See statsmodels *model.fit*
|
||||||
@ -141,15 +143,17 @@ def regress(
|
|||||||
if not any(x.__name__ == 'RegressionModel' for x in model_class.__bases__):
|
if not any(x.__name__ == 'RegressionModel' for x in model_class.__bases__):
|
||||||
raise ValueError('model_class must be a RegressionModel')
|
raise ValueError('model_class must be a RegressionModel')
|
||||||
|
|
||||||
df_ref = weakref.ref(df)
|
# Additional columns to check for NaN - exposure, status, etc.
|
||||||
dmatrices, dep_categories = df_to_dmatrices(df, dep, formula, nan_policy)
|
additional_columns = []
|
||||||
|
|
||||||
# Build function call arguments
|
# Build function call arguments
|
||||||
fit_kwargs = {}
|
fit_kwargs = {}
|
||||||
if exposure is not None:
|
if exposure is not None:
|
||||||
fit_kwargs['exposure'] = exposure
|
additional_columns.append(exposure)
|
||||||
if family is not None:
|
if family is not None:
|
||||||
fit_kwargs['family'] = family
|
fit_kwargs['family'] = family
|
||||||
|
if status is not None:
|
||||||
|
additional_columns.append(status)
|
||||||
if method is not None:
|
if method is not None:
|
||||||
fit_kwargs['method'] = method
|
fit_kwargs['method'] = method
|
||||||
if maxiter is not None:
|
if maxiter is not None:
|
||||||
@ -161,6 +165,16 @@ def regress(
|
|||||||
if reduced is not None:
|
if reduced is not None:
|
||||||
fit_kwargs['reduced'] = reduced
|
fit_kwargs['reduced'] = reduced
|
||||||
|
|
||||||
|
# Preprocess data, check for NaN and get design matrices
|
||||||
|
df_ref = weakref.ref(df)
|
||||||
|
df_clean, dmatrices, dep_categories = df_to_dmatrices(df, dep, formula, nan_policy, additional_columns)
|
||||||
|
|
||||||
|
# Add function call arguments for supplemental columns - exposure, status, etc.
|
||||||
|
if exposure is not None:
|
||||||
|
fit_kwargs['exposure'] = df_clean[exposure]
|
||||||
|
if status is not None:
|
||||||
|
fit_kwargs['status'] = df_clean[status]
|
||||||
|
|
||||||
# Fit model
|
# Fit model
|
||||||
result = model_class.fit(dmatrices[0], dmatrices[1], **fit_kwargs)
|
result = model_class.fit(dmatrices[0], dmatrices[1], **fit_kwargs)
|
||||||
|
|
||||||
@ -197,7 +211,7 @@ def regress(
|
|||||||
result.terms[raw_name].categories[term] = cutoff_term
|
result.terms[raw_name].categories[term] = cutoff_term
|
||||||
else:
|
else:
|
||||||
# Parse if required
|
# Parse if required
|
||||||
factor, column, contrast = parse_patsy_term(formula, df, raw_name)
|
factor, column, contrast = parse_patsy_term(formula, df_clean, raw_name)
|
||||||
|
|
||||||
if contrast is not None:
|
if contrast is not None:
|
||||||
# Categorical term
|
# Categorical term
|
||||||
@ -208,7 +222,7 @@ def regress(
|
|||||||
else:
|
else:
|
||||||
# Add a new categorical term if not exists
|
# Add a new categorical term if not exists
|
||||||
if column not in result.terms:
|
if column not in result.terms:
|
||||||
ref_category = formula_factor_ref_category(formula, df, factor)
|
ref_category = formula_factor_ref_category(formula, df_clean, factor)
|
||||||
result.terms[column] = CategoricalTerm({}, ref_category)
|
result.terms[column] = CategoricalTerm({}, ref_category)
|
||||||
|
|
||||||
result.terms[column].categories[contrast] = raw_term
|
result.terms[column].categories[contrast] = raw_term
|
||||||
@ -218,10 +232,13 @@ def regress(
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def df_to_dmatrices(df, dep, formula, nan_policy):
|
def df_to_dmatrices(df, dep, formula, nan_policy, additional_columns=[]):
|
||||||
# Check for/clean NaNs in input columns
|
# Check for/clean NaNs in input columns
|
||||||
columns = cols_for_formula(dep, df) + cols_for_formula(formula, df)
|
columns = cols_for_formula(dep, df) + cols_for_formula(formula, df)
|
||||||
|
|
||||||
|
if additional_columns:
|
||||||
|
columns += additional_columns
|
||||||
|
|
||||||
df = df[columns]
|
df = df[columns]
|
||||||
df = check_nan(df, nan_policy)
|
df = check_nan(df, nan_policy)
|
||||||
|
|
||||||
@ -237,7 +254,7 @@ def df_to_dmatrices(df, dep, formula, nan_policy):
|
|||||||
# Construct design matrix from formula
|
# Construct design matrix from formula
|
||||||
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
dmatrices = patsy.dmatrices(dep + ' ~ ' + formula, df, return_type='dataframe')
|
||||||
|
|
||||||
return dmatrices, dep_categories
|
return df, dmatrices, dep_categories
|
||||||
|
|
||||||
class RegressionModel:
|
class RegressionModel:
|
||||||
# TODO: Documentation
|
# TODO: Documentation
|
||||||
@ -630,19 +647,55 @@ class CategoricalTerm:
|
|||||||
#: Name of the reference category (*str*)
|
#: Name of the reference category (*str*)
|
||||||
self.ref_category = ref_category
|
self.ref_category = ref_category
|
||||||
|
|
||||||
def raw_terms_from_statsmodels_result(raw_result, *, wrapped=True):
    """
    Build a dict of :class:`SingleTerm` from a statsmodels result, keyed by term name

    The raw name ``Intercept`` is keyed as ``(Intercept)``; all other raw names are used unchanged as keys.
    Confidence intervals are requested from the result at level *config.alpha*.

    :param raw_result: Result object providing *model.exog_names*, *params*, *conf_int* and *pvalues*
    :param wrapped: If *True*, *params*, *conf_int* and *pvalues* are unwrapped via their *values* attribute before iteration; if *False*, they are iterated over directly
    :type wrapped: bool

    :return: Mapping of term name to the corresponding term
    :rtype: dict
    """

    def unwrap(obj):
        # With wrapped=True the estimates are read through ".values"
        # (presumably pandas objects - confirm against caller); otherwise use as-is
        return obj.values if wrapped else obj

    names = raw_result.model.exog_names
    coefs = unwrap(raw_result.params)
    intervals = unwrap(raw_result.conf_int(config.alpha))
    pvalues = unwrap(raw_result.pvalues)

    terms = {}
    for name, coef, interval, pvalue in zip(names, coefs, intervals, pvalues):
        key = '(Intercept)' if name == 'Intercept' else name
        terms[key] = SingleTerm(
            raw_name=name,
            beta=Estimate(coef, interval[0], interval[1]),
            pvalue=pvalue
        )

    return terms
|
||||||
|
|
||||||
# ------------------------
|
# ------------------------
|
||||||
# Concrete implementations
|
# Concrete implementations
|
||||||
|
|
||||||
|
class Cox(RegressionModel):
    """
    Cox regression, fitted using statsmodels *PHReg*

    When fitting through :func:`yli.regress`, the *status* argument names the column of the
    data frame which is passed on to :meth:`fit` as *status*.
    """

    @property
    def model_long_name(self):
        """Long display name of the model (*str*)"""
        return 'Cox Regression'

    @property
    def model_short_name(self):
        """Short display name of the model (*str*)"""
        return 'Cox'

    @classmethod
    def fit(cls, data_dep, data_ind, *, status=None):
        """
        Fit a Cox regression and return the result as a new :class:`Cox` instance

        :param data_dep: Dependent variable data, passed to *PHReg* as *endog*
        :param data_ind: Design matrix of independent variables, passed to *PHReg* as *exog*; an ``Intercept`` column, if present, is excluded (the caller's object is not modified)
        :param status: Passed to *PHReg* as *status* (presumably the per-observation event indicator - confirm against the statsmodels documentation)

        :return: The fitted model
        :rtype: Cox
        """

        # Drop explicit intercept term
        # Use a non-mutating drop (rather than "del") so the caller's design matrix is left intact
        if 'Intercept' in data_ind:
            data_ind = data_ind.drop(columns='Intercept')

        result = cls()
        result.exp = True
        result.cov_type = 'nonrobust'

        # Perform regression
        # missing='raise': NaN handling is expected to have happened before this point
        # (regress cleans the data via df_to_dmatrices/check_nan)
        raw_result = sm.PHReg(endog=data_dep, exog=data_ind, status=status, missing='raise').fit(disp=False)

        result.ll_model = raw_result.llf

        # wrapped=False: the estimates are iterated over directly, without ".values" unwrapping
        result.terms = raw_terms_from_statsmodels_result(raw_result, wrapped=False)
        result.vcov = raw_result.cov_params()

        return result
|
||||||
|
|
||||||
class GLM(RegressionModel):
|
class GLM(RegressionModel):
|
||||||
# TODO: Documentation
|
# TODO: Documentation
|
||||||
|
|
||||||
@ -1093,7 +1146,7 @@ class OrdinalLogit(RegressionModel):
|
|||||||
dep = self.dep
|
dep = self.dep
|
||||||
|
|
||||||
# Precompute design matrices
|
# Precompute design matrices
|
||||||
dmatrices, dep_categories = df_to_dmatrices(df, dep, self.formula, 'omit')
|
_, dmatrices, dep_categories = df_to_dmatrices(df, dep, self.formula, 'omit')
|
||||||
s_dep = dmatrices[0][dmatrices[0].columns[0]] # df[dep] as series - must get this from dmatrices to account for as_numeric
|
s_dep = dmatrices[0][dmatrices[0].columns[0]] # df[dep] as series - must get this from dmatrices to account for as_numeric
|
||||||
dmatrix_right = dmatrices[1]
|
dmatrix_right = dmatrices[1]
|
||||||
dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
|
dmatrix_right.reset_index(drop=True, inplace=True) # Otherwise this confuses matrix multiplication
|
||||||
|
Loading…
Reference in New Issue
Block a user