From 2852d3dd198c3fc2696235d13ef0980d8c73df5d Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Mon, 17 Apr 2023 22:38:44 +1000 Subject: [PATCH] Implement yli.IntervalCensoredCox --- README.md | 4 ++- docs/global.rst | 3 ++ yli/__init__.py | 2 +- yli/config.py | 3 ++ yli/regress.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 102 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ca8a5e9..9cc030d 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,8 @@ The mandatory dependencies of this library are: Optional dependencies are: -* matplotlib and [seaborn](https://seaborn.pydata.org/), for plotting functions +* [hpstat](https://yingtongli.me/git/hpstat), for *IntervalCensoredCox* +* [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), for plotting functions * [mpmath](https://mpmath.org/), for *beta_ratio* and *beta_oddsratio* * [PyCryptodome](https://www.pycryptodome.org/), for *pickle_write_encrypted* and *pickle_read_encrypted* * [rpy2](https://rpy2.github.io/), with R packages: @@ -64,6 +65,7 @@ Relevant statistical functions are all directly available from the top-level *yl * *pearsonr*: Pearson correlation coefficient *r* * *ttest_ind*: Independent 2-sample *t* test * Regression: + * *IntervalCensoredCox*: Model for interval-censored Cox regression * *PenalisedLogit*: Model for Firth penalised logistic regression * *regress*: Fit arbitrary regression models * *vif*: Compute the variance inflation factor for independent variables in regression diff --git a/docs/global.rst b/docs/global.rst index 51ca341..c8eda50 100644 --- a/docs/global.rst +++ b/docs/global.rst @@ -17,3 +17,6 @@ Global options .. autoattribute:: yli.config.Config.repr_is_summary :annotation: = True + + .. autoattribute:: yli.config.Config.hpstat_path + :annotation: = True diff --git a/yli/__init__.py b/yli/__init__.py index eedef22..9c5f614 100644 --- a/yli/__init__.py +++ b/yli/__init__.py @@ -20,7 +20,7 @@ from .descriptives import auto_correlations, auto_descriptives from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .graphs import init_fonts from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted -from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif +from .regress import IntervalCensoredCox, Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind from .survival import kaplanmeier, logrank, turnbull from .utils import as_ordinal diff --git a/yli/config.py b/yli/config.py index 7e56694..f49aa1d 100644 --- a/yli/config.py +++ b/yli/config.py @@ -32,6 +32,9 @@ class Config: #: If enabled, `__repr__` on test results, etc. directly calls the ``summary`` function (*bool*) self.repr_is_summary = True + + #: Path to hpstat binary + self.hpstat_path = './hpstat' """Global configuration singleton""" config = Config() diff --git a/yli/regress.py b/yli/regress.py index 4d334ed..6afd89c 100644 --- a/yli/regress.py +++ b/yli/regress.py @@ -105,7 +105,7 @@ def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels :type model_class: :class:`yli.regress.RegressionModel` subclass :param df: Data to perform regression on :type df: DataFrame - :param dep: Column in *df* for the dependent variable (numeric) + :param dep: Column(s) in *df* for the dependent variable (numeric) :type dep: str :param formula: Patsy formula for the regression model :type formula: str @@ -184,13 +184,16 @@ def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels def df_to_dmatrices(df, dep, formula, nan_policy): # Check for/clean NaNs in input columns - columns = [dep] + cols_for_formula(formula, df) + columns = cols_for_formula(dep, df) + cols_for_formula(formula, df) df = df[columns] df = check_nan(df, nan_policy) # Ensure numeric type for dependent variable - df[dep], dep_categories = as_numeric(df[dep]) + if '+' in dep: + dep_categories = None + else: + df[dep], dep_categories = as_numeric(df[dep]) # Convert pandas nullable types for independent variables as this breaks statsmodels df = convert_pandas_nullable(df) @@ -575,6 +578,92 @@ def raw_terms_from_statsmodels_result(raw_result): # ------------------------ # Concrete implementations +class IntervalCensoredCox(RegressionModel): + """ + Interval-censored Cox regression + + Uses hpstat *intcox* command. + + **Example:** + + .. code-block:: + + df = pd.DataFrame(...) + yli.regress(yli.IntervalCensoredCox, df, 'Left_Time + Right_Time', 'A + B + C + D + E + F + G + H') + + .. code-block:: text + + Interval-Censored Cox Regression Results + =================================================================== + Dep. Variable: Left_Time + Right_Time | No. Observations: 1124 + Model: Interval-Censored Cox | No. Events: 133 + Date: 2023-04-17 | Df. Model: 8 + Time: 22:30:40 | Pseudo R²: 0.00 + Std. Errors: OPG | LL-Model: -613.21 + | LL-Null: -615.81 + | p (LR): 0.82 + =================================================================== + exp(β) (95% CI) p + ---------------------------------- + A 0.83 (0.37 - 1.88) 0.66 + B 1.08 (0.81 - 1.46) 0.60 + C 0.49 (0.24 - 1.00) 0.052 + D 0.79 (0.42 - 1.50) 0.48 + E 0.87 (0.40 - 1.85) 0.71 + F 0.64 (0.28 - 1.45) 0.29 + G 1.07 (0.44 - 2.62) 0.88 + H 1.23 (0.48 - 3.20) 0.67 + ---------------------------------- + + The output summarises the result of the regression. + + **Reference:** Zeng D, Mao L, Lin DY. Maximum likelihood estimation for semiparametric transformation models with interval-censored data. *Biometrika*. 2016;103(2):253–71. `doi:10.1093/biomet/asw013 `_ + """ + + @property + def model_long_name(self): + return 'Interval-Censored Cox Regression' + + @property + def model_short_name(self): + return 'Interval-Censored Cox' + + @classmethod + def fit(cls, data_dep, data_ind): + if len(data_dep.columns) != 2: + raise ValueError('IntervalCensoredCox requires left and right times') + + # Drop explicit intercept term + if 'Intercept' in data_ind: + del data_ind['Intercept'] + + result = cls() + result.exp = True + result.cov_type = 'OPG' + result.nevents = np.isfinite(data_dep.iloc[:, 1]).sum() + result.dof_model = len(data_ind.columns) + + # Export data to CSV + csv_buf = io.StringIO() + data_dep.join(data_ind).to_csv(csv_buf, index=False) + csv_str = csv_buf.getvalue() + + # Run intcens binary + proc = subprocess.run([config.hpstat_path, 'intcox', '-', '--output', 'json'], input=csv_str, capture_output=True, encoding='utf-8', check=True) + raw_result = json.loads(proc.stdout) + + z_critical = -stats.norm.ppf(config.alpha / 2) + result.terms = {raw_name: SingleTerm( + raw_name=raw_name, + beta=Estimate(raw_param, raw_param - z_critical * raw_se, raw_param + z_critical * raw_se), + pvalue=stats.norm.cdf(-np.abs(raw_param) / raw_se) * 2 + ) for raw_name, raw_param, raw_se in zip(data_ind.columns, raw_result['params'], raw_result['params_se'])} + + result.ll_model = raw_result['ll_model'] + result.ll_null = raw_result['ll_null'] + + return result + class Logit(RegressionModel): """ Logistic regression