Large refactor of yli.regress

2023-04-16 21:56:09 +10:00 · 2023-04-16 21:56:09 +10:00 · ac2aca7b8f
commit ac2aca7b8f
parent dbfcec56c3
8 changed files with 701 additions and 979 deletions
--- a/tests/test_anova.py
+++ b/tests/test_anova.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
 from pytest import approx

 import pandas as pd
-import statsmodels.api as sm

 import yli

@@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
 		'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
 	})
 	
-	result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
+	result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
 	
 	assert result.statistic == approx(545.316/18.4366, rel=0.001)
 	assert result.dof1 == 2
--- a/tests/test_bayes_factors.py
+++ b/tests/test_bayes_factors.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
 from pytest import approx

 import pandas as pd
-import statsmodels.api as sm

 import yli

@@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
 		'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
 	})
 	
-	result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+	result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
 	
 	# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
 	# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
--- a/tests/test_chi2.py
+++ b/tests/test_chi2.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -68,10 +68,10 @@ def test_chi2_ol10_18():
 	assert result.oddsratio.ci_lower == approx(1.113, abs=0.001)
 	assert result.oddsratio.ci_upper == approx(1.596, abs=0.001)
 	
-	expected_summary = '''Stress    False  True
-Response             
-False       250   400
-True        750  1600
+	expected_summary = '''Stress    False  True 
+Response              
+False       250    400
+True        750   1600

 χ²(1) = 9.82; p = 0.002*
 OR (95% CI) = 1.33 (1.11–1.60)
--- a/tests/test_ordinallogit.py
+++ b/tests/test_ordinallogit.py
@@ -41,16 +41,16 @@ def test_ordinallogit_ucla():
 	assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_lower == approx(2.72234, abs=0.001)
 	assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
 	
-	expected_summary = '''              Ordinal Logistic Regression Results              
-===============================================================
-Dep. Variable:              apply  |  No. Observations:     400
-        Model:       OrdinalLogit  |         Df. Model:       5
-       Method: Maximum Likelihood  |     Df. Residuals:     395
-         Date:         {0:%Y-%m-%d}  |         Pseudo R²:    0.03
-         Time:           {0:%H:%M:%S}  |          LL-Model: -358.51
-  Std. Errors:         Non-Robust  |           LL-Null: -370.60
-                                   |            p (LR): <0.001*
-===============================================================
+	expected_summary = '''           Ordinal Logistic Regression Results            
+==========================================================
+Dep. Variable:         apply  |  No. Observations:     400
+        Model: Ordinal Logit  |         Df. Model:       5
+         Date:    {0:%Y-%m-%d}  |     Df. Residuals:     395
+         Time:      {0:%H:%M:%S}  |         Pseudo R²:    0.03
+  Std. Errors:    Non-Robust  |          LL-Model: -358.51
+                              |           LL-Null: -370.60
+                              |            p (LR): <0.001*
+============================================================
                                β      (95% CI)         p   
 ------------------------------------------------------------
                      pared    1.05  (0.53 - 1.57)   <0.001*
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -14,11 +14,11 @@
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.

+import pytest
 from pytest import approx

 import numpy as np
 import pandas as pd
-import statsmodels.api as sm

 import yli
 from yli.regress import CategoricalTerm
@@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
 		'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
 	})
 	
-	result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
+	result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
 	
 	assert result.dof_model == 1
 	assert result.dof_resid == 18
@@ -46,9 +46,27 @@ def test_regress_ols_ol11_4():
 	
 	assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
 	assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
+	
+	expected_summary = '''       Ordinary Least Squares Regression Results       
+=======================================================
+Dep. Variable:  GrowthRet  |  No. Observations:      20
+        Model:        OLS  |         Df. Model:       1
+         Date: {0:%Y-%m-%d}  |     Df. Residuals:      18
+         Time:   {0:%H:%M:%S}  |                R²:    0.74
+  Std. Errors: Non-Robust  |                 F:   52.01
+                           |             p (F): <0.001*
+=======================================================
+                β       (95% CI)          p   
+----------------------------------------------
+(Intercept)   47.48  (38.17 - 56.78)   <0.001*
+     SoilPh   -7.86 (-10.15 - -5.57)   <0.001*
+----------------------------------------------'''.format(result.fitted_dt)
+	
+	assert result.summary() == expected_summary

+@pytest.mark.skip('Not implemented in refactored regression implementation')
 def test_regress_bootstrap_ols_ol11_4():
-	"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
+	"""Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
 	
 	df = pd.DataFrame({
 		'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
@@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
 	})
 	df['LNC'] = np.log(df['C'])
 	
-	result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
+	result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
 	
 	assert result.dof_model == 10
 	assert result.dof_resid == 21
@@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
 		'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
 	})
 	
-	result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+	result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
 	
 	# Some numerical differences as intercept term is very negative
 	lrtest_result = result.lrtest_null()
@@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
 ======================================================
 Dep. Variable:  Unhealthy  |  No. Observations:     32
        Model:      Logit  |         Df. Model:      2
-       Method:        MLE  |     Df. Residuals:     29
-         Date: {0:%Y-%m-%d}  |         Pseudo R²:   0.26
-         Time:   {0:%H:%M:%S}  |          LL-Model: -11.47
-  Std. Errors: Non-Robust  |           LL-Null: -15.44
+         Date: {0:%Y-%m-%d}  |     Df. Residuals:     29
+         Time:   {0:%H:%M:%S}  |         Pseudo R²:   0.26
+  Std. Errors: Non-Robust  |          LL-Model: -11.47
+                           |           LL-Null: -15.44
                           |            p (LR):  0.02*
 ======================================================
                exp(β)   (95% CI)          p   
@@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
 		'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
 	})
 	
-	result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
+	result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
 	
 	assert isinstance(result.terms['Response'], CategoricalTerm)
 	assert result.terms['Response'].ref_category == False
@@ -217,15 +235,15 @@ def test_regress_penalisedlogit_kleinman():
 	assert lrtest_result.dof == 1
 	assert lrtest_result.pvalue < 0.0001
 	
-	expected_summary = '''          Penalised Logistic Regression Results          
-=========================================================
-Dep. Variable:      Outcome  |  No. Observations:     240
-        Model:        Logit  |         Df. Model:       1
-       Method: Penalised ML  |         Pseudo R²:    0.37
-         Date:   {0:%Y-%m-%d}  |          LL-Model:  -66.43
-         Time:     {0:%H:%M:%S}  |           LL-Null: -105.91
-  Std. Errors:   Non-Robust  |            p (LR): <0.001*
-=========================================================
+	expected_summary = '''           Penalised Logistic Regression Results            
+============================================================
+Dep. Variable:         Outcome  |  No. Observations:     240
+        Model: Penalised Logit  |         Df. Model:       1
+         Date:      {0:%Y-%m-%d}  |         Pseudo R²:    0.37
+         Time:        {0:%H:%M:%S}  |          LL-Model:  -66.43
+  Std. Errors:      Non-Robust  |           LL-Null: -105.91
+                                |            p (LR): <0.001*
+============================================================
                β      (95% CI)          p   
 ---------------------------------------------
 (Intercept)   -2.28 (-2.77 - -1.85)   <0.001*
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .graphs import init_fonts
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
+from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
-from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
+from .survival import kaplanmeier, logrank, turnbull
 from .utils import as_ordinal

 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
--- a/yli/survival.py
+++ b/yli/survival.py
@@ -20,11 +20,7 @@ import statsmodels.api as sm

 from .config import config
 from .sig_tests import ChiSquaredResult
-from .regress import RegressionResult, SingleTerm
-from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
-
-from datetime import datetime
-import weakref
+from .utils import Estimate, check_nan

 def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
 	"""
@@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
 	statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
 	
 	return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
-
-# --------------------------------
-# Interval-censored Cox regression
-
-def cox_interval_censored(
-	df, time_left, time_right, formula, *,
-	bootstrap_samples=100,
-	nan_policy='warn',
-	bool_baselevels=False, exp=True,
-):
-	# TODO: Documentation
-	
-	df_ref = weakref.ref(df)
-	
-	# Check for/clean NaNs in input columns
-	columns = [time_left, time_right] + cols_for_formula(formula, df)
-	
-	df = df[columns]
-	df = check_nan(df, nan_policy)
-	
-	# FIXME: Ensure numeric type for dependent variable
-	#df[dep], dep_categories = as_numeric(df[dep])
-	if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64':
-		raise NotImplementedError('Time dtypes must be float64')
-	
-	# Convert pandas nullable types for independent variables
-	df = convert_pandas_nullable(df)
-	
-	# ---------
-	# Fit model
-	
-	# lifelines.CoxPHFitter doesn't do confidence intervals so we use R
-	
-	import rpy2.robjects as ro
-	import rpy2.robjects.packages
-	import rpy2.robjects.pandas2ri
-	
-	# Convert bool to int otherwise rpy2 chokes
-	df = df.replace({False: 0, True: 1})
-	
-	# Import icenReg
-	ro.packages.importr('icenReg')
-	
-	with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
-		with ro.local_context() as lc:
-			# Convert DataFrame to R
-			lc['df'] = df
-			
-			# Transfer other parameters to R
-			lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
-			lc['bootstrap_samples'] = bootstrap_samples
-			
-			# FIXME: Seed bootstrap RNG?
-			
-			# Fit the model
-			ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
-			
-			model = ro.r('model')
-			# Hard to access attributes through rpy2
-			term_parameters = ro.r('model$coef')
-			term_names = ro.r('names(model$coef)')
-			term_cis = ro.r('confint(model)')
-			cov_matrix = ro.r('model$var')
-			llf = ro.r('model$llk')[0]
-			
-			# TODO: Handle categorical terms?
-			terms = {}
-			for i in range(len(term_parameters)):
-				# These values not directly exposed so we must calculate them
-				se = np.sqrt(cov_matrix[i, i])
-				pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(term_parameters[i]))
-				
-				term = SingleTerm(term_names[i], Estimate(term_parameters[i], term_cis[i][0], term_cis[i][1]), pvalue)
-				terms[term_names[i]] = term
-			
-			result = RegressionResult(
-				None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
-				model,
-				'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
-				len(df), None, None, datetime.now(), 'Bootstrap',
-				terms,
-				llf, None,
-				None, None, None,
-				[],
-				exp
-			)
-	
-	return result