From 2852d3dd198c3fc2696235d13ef0980d8c73df5d Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Mon, 17 Apr 2023 22:38:44 +1000
Subject: [PATCH] Implement yli.IntervalCensoredCox

---
 README.md       |  4 ++-
 docs/global.rst |  3 ++
 yli/__init__.py |  2 +-
 yli/config.py   |  3 ++
 yli/regress.py  | 95 +++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ca8a5e9..9cc030d 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,8 @@ The mandatory dependencies of this library are:
 
 Optional dependencies are:
 
-* matplotlib and [seaborn](https://seaborn.pydata.org/), for plotting functions
+* [hpstat](https://yingtongli.me/git/hpstat), for *IntervalCensoredCox*
+* [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/), for plotting functions
 * [mpmath](https://mpmath.org/), for *beta_ratio* and *beta_oddsratio*
 * [PyCryptodome](https://www.pycryptodome.org/), for *pickle_write_encrypted* and *pickle_read_encrypted*
 * [rpy2](https://rpy2.github.io/), with R packages:
@@ -64,6 +65,7 @@ Relevant statistical functions are all directly available from the top-level *yl
 	* *pearsonr*: Pearson correlation coefficient *r*
 	* *ttest_ind*: Independent 2-sample *t* test
 * Regression:
+	* *IntervalCensoredCox*: Model for interval-censored Cox regression
 	* *PenalisedLogit*: Model for Firth penalised logistic regression
 	* *regress*: Fit arbitrary regression models
 	* *vif*: Compute the variance inflation factor for independent variables in regression
diff --git a/docs/global.rst b/docs/global.rst
index 51ca341..c8eda50 100644
--- a/docs/global.rst
+++ b/docs/global.rst
@@ -17,3 +17,6 @@ Global options
 	
 	.. autoattribute:: yli.config.Config.repr_is_summary
 		:annotation: = True
+	
+	.. autoattribute:: yli.config.Config.hpstat_path
+		:annotation: = True
diff --git a/yli/__init__.py b/yli/__init__.py
index eedef22..9c5f614 100644
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -20,7 +20,7 @@ from .descriptives import auto_correlations, auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .graphs import init_fonts
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
+from .regress import IntervalCensoredCox, Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
 from .survival import kaplanmeier, logrank, turnbull
 from .utils import as_ordinal
diff --git a/yli/config.py b/yli/config.py
index 7e56694..f49aa1d 100644
--- a/yli/config.py
+++ b/yli/config.py
@@ -32,6 +32,9 @@ class Config:
 		
 		#: If enabled, `__repr__` on test results, etc. directly calls the ``summary`` function (*bool*)
 		self.repr_is_summary = True
+		
+		#: Path to hpstat binary
+		self.hpstat_path = './hpstat'
 
 """Global configuration singleton"""
 config = Config()
diff --git a/yli/regress.py b/yli/regress.py
index 4d334ed..6afd89c 100644
--- a/yli/regress.py
+++ b/yli/regress.py
@@ -105,7 +105,7 @@ def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels
 	:type model_class: :class:`yli.regress.RegressionModel` subclass
 	:param df: Data to perform regression on
 	:type df: DataFrame
-	:param dep: Column in *df* for the dependent variable (numeric)
+	:param dep: Column(s) in *df* for the dependent variable (numeric)
 	:type dep: str
 	:param formula: Patsy formula for the regression model
 	:type formula: str
@@ -184,13 +184,16 @@ def regress(model_class, df, dep, formula, *, nan_policy='warn', bool_baselevels
 
 def df_to_dmatrices(df, dep, formula, nan_policy):
 	# Check for/clean NaNs in input columns
-	columns = [dep] + cols_for_formula(formula, df)
+	columns = cols_for_formula(dep, df) + cols_for_formula(formula, df)
 	
 	df = df[columns]
 	df = check_nan(df, nan_policy)
 	
 	# Ensure numeric type for dependent variable
-	df[dep], dep_categories = as_numeric(df[dep])
+	if '+' in dep:
+		dep_categories = None
+	else:
+		df[dep], dep_categories = as_numeric(df[dep])
 	
 	# Convert pandas nullable types for independent variables as this breaks statsmodels
 	df = convert_pandas_nullable(df)
@@ -575,6 +578,92 @@ def raw_terms_from_statsmodels_result(raw_result):
 # ------------------------
 # Concrete implementations
 
+class IntervalCensoredCox(RegressionModel):
+	"""
+	Interval-censored Cox regression
+	
+	Uses hpstat *intcox* command.
+	
+	**Example:**
+	
+	.. code-block::
+		
+		df = pd.DataFrame(...)
+		yli.regress(yli.IntervalCensoredCox, df, 'Left_Time + Right_Time', 'A + B + C + D + E + F + G + H')
+	
+	.. code-block:: text
+		
+		              Interval-Censored Cox Regression Results             
+		===================================================================
+		Dep. Variable: Left_Time + Right_Time  |  No. Observations:    1124
+		        Model:  Interval-Censored Cox  |        No. Events:     133
+		         Date:             2023-04-17  |         Df. Model:       8
+		         Time:               22:30:40  |         Pseudo R²:    0.00
+		  Std. Errors:                    OPG  |          LL-Model: -613.21
+		                                       |           LL-Null: -615.81
+		                                       |            p (LR):    0.82
+		===================================================================
+		    exp(β)   (95% CI)         p   
+		----------------------------------
+		A     0.83 (0.37 - 1.88)    0.66  
+		B     1.08 (0.81 - 1.46)    0.60  
+		C     0.49 (0.24 - 1.00)    0.052 
+		D     0.79 (0.42 - 1.50)    0.48  
+		E     0.87 (0.40 - 1.85)    0.71  
+		F     0.64 (0.28 - 1.45)    0.29  
+		G     1.07 (0.44 - 2.62)    0.88  
+		H     1.23 (0.48 - 3.20)    0.67  
+		----------------------------------
+	
+	The output summarises the result of the regression.
+	
+	**Reference:** Zeng D, Mao L, Lin DY. Maximum likelihood estimation for semiparametric transformation models with interval-censored data. *Biometrika*. 2016;103(2):253–71. `doi:10.1093/biomet/asw013 <https://doi.org/10.1093/biomet/asw013>`_
+	"""
+	
+	@property
+	def model_long_name(self):
+		return 'Interval-Censored Cox Regression'
+	
+	@property
+	def model_short_name(self):
+		return 'Interval-Censored Cox'
+	
+	@classmethod
+	def fit(cls, data_dep, data_ind):
+		if len(data_dep.columns) != 2:
+			raise ValueError('IntervalCensoredCox requires left and right times')
+		
+		# Drop explicit intercept term
+		if 'Intercept' in data_ind:
+			del data_ind['Intercept']
+		
+		result = cls()
+		result.exp = True
+		result.cov_type = 'OPG'
+		result.nevents = np.isfinite(data_dep.iloc[:, 1]).sum()
+		result.dof_model = len(data_ind.columns)
+		
+		# Export data to CSV
+		csv_buf = io.StringIO()
+		data_dep.join(data_ind).to_csv(csv_buf, index=False)
+		csv_str = csv_buf.getvalue()
+		
+		# Run intcens binary
+		proc = subprocess.run([config.hpstat_path, 'intcox', '-', '--output', 'json'], input=csv_str, capture_output=True, encoding='utf-8', check=True)
+		raw_result = json.loads(proc.stdout)
+		
+		z_critical = -stats.norm.ppf(config.alpha / 2)
+		result.terms = {raw_name: SingleTerm(
+			raw_name=raw_name,
+			beta=Estimate(raw_param, raw_param - z_critical * raw_se, raw_param + z_critical * raw_se),
+			pvalue=stats.norm.cdf(-np.abs(raw_param) / raw_se) * 2
+		) for raw_name, raw_param, raw_se in zip(data_ind.columns, raw_result['params'], raw_result['params_se'])}
+		
+		result.ll_model = raw_result['ll_model']
+		result.ll_null = raw_result['ll_null']
+		
+		return result
+
 class Logit(RegressionModel):
 	"""
 	Logistic regression