Large refactor of yli.regress

2023-04-16 21:56:09 +10:00 · 2023-04-16 21:56:09 +10:00 · ac2aca7b8f
parent dbfcec56c3
commit ac2aca7b8f
8 changed files with 701 additions and 979 deletions
--- a/tests/test_anova.py
+++ b/tests/test_anova.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
 from pytest import approx
 import pandas as pd
 import statsmodels.api as sm
 import yli
@@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
 		'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
 	})
-	result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
+	result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
 	assert result.statistic == approx(545.316/18.4366, rel=0.001)
 	assert result.dof1 == 2
--- a/tests/test_bayes_factors.py
+++ b/tests/test_bayes_factors.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -17,7 +17,6 @@
 from pytest import approx
 import pandas as pd
 import statsmodels.api as sm
 import yli
@@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
 		'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
 	})
-	result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+	result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
 	# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
 	# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
--- a/tests/test_chi2.py
+++ b/tests/test_chi2.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -68,10 +68,10 @@ def test_chi2_ol10_18():
 	assert result.oddsratio.ci_lower == approx(1.113, abs=0.001)
 	assert result.oddsratio.ci_upper == approx(1.596, abs=0.001)
-	expected_summary = '''Stress    False  True
+	expected_summary = '''Stress    False  True 
-Response             
+Response              
-False       250   400
+False       250    400
-True        750  1600
+True        750   1600
 χ²(1) = 9.82; p = 0.002*
 OR (95% CI) = 1.33 (1.11–1.60)
--- a/tests/test_ordinallogit.py
+++ b/tests/test_ordinallogit.py
@@ -41,16 +41,16 @@ def test_ordinallogit_ucla():
 	assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_lower == approx(2.72234, abs=0.001)
 	assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
-	expected_summary = '''              Ordinal Logistic Regression Results              
+	expected_summary = '''           Ordinal Logistic Regression Results            
-===============================================================
+==========================================================
-Dep. Variable:              apply  |  No. Observations:     400
+Dep. Variable:         apply  |  No. Observations:     400
-        Model:       OrdinalLogit  |         Df. Model:       5
+        Model: Ordinal Logit  |         Df. Model:       5
-       Method: Maximum Likelihood  |     Df. Residuals:     395
+         Date:    {0:%Y-%m-%d}  |     Df. Residuals:     395
-         Date:         {0:%Y-%m-%d}  |         Pseudo R²:    0.03
+         Time:      {0:%H:%M:%S}  |         Pseudo R²:    0.03
-         Time:           {0:%H:%M:%S}  |          LL-Model: -358.51
+  Std. Errors:    Non-Robust  |          LL-Model: -358.51
-  Std. Errors:         Non-Robust  |           LL-Null: -370.60
+                              |           LL-Null: -370.60
-                                   |            p (LR): <0.001*
+                              |            p (LR): <0.001*
-===============================================================
+============================================================
                                β      (95% CI)         p   
 ------------------------------------------------------------
                      pared    1.05  (0.53 - 1.57)   <0.001*
--- a/tests/test_regress.py
+++ b/tests/test_regress.py
@@ -1,5 +1,5 @@
 #   scipy-yli: Helpful SciPy utilities and recipes
-#   Copyright © 2022  Lee Yingtong Li (RunasSudo)
+#   Copyright © 2022–2023  Lee Yingtong Li (RunasSudo)
 #
 #   This program is free software: you can redistribute it and/or modify
 #   it under the terms of the GNU Affero General Public License as published by
@@ -14,11 +14,11 @@
 #   You should have received a copy of the GNU Affero General Public License
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 import pytest
 from pytest import approx
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
 import yli
 from yli.regress import CategoricalTerm
@@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
 		'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
 	})
-	result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
+	result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
 	assert result.dof_model == 1
 	assert result.dof_resid == 18
@@ -46,9 +46,27 @@ def test_regress_ols_ol11_4():
 	assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
 	assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
 	expected_summary = '''       Ordinary Least Squares Regression Results       
 =======================================================
 Dep. Variable:  GrowthRet  |  No. Observations:      20
        Model:        OLS  |         Df. Model:       1
         Date: {0:%Y-%m-%d}  |     Df. Residuals:      18
         Time:   {0:%H:%M:%S}  |                R²:    0.74
  Std. Errors: Non-Robust  |                 F:   52.01
                           |             p (F): <0.001*
 =======================================================
                β       (95% CI)          p   
 ----------------------------------------------
 (Intercept)   47.48  (38.17 - 56.78)   <0.001*
     SoilPh   -7.86 (-10.15 - -5.57)   <0.001*
 ----------------------------------------------'''.format(result.fitted_dt)
 	assert result.summary() == expected_summary
+@pytest.mark.skip('Not implemented in refactored regression implementation')
 def test_regress_bootstrap_ols_ol11_4():
-	"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
+	"""Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
 	df = pd.DataFrame({
 		'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
@@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
 	})
 	df['LNC'] = np.log(df['C'])
-	result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
+	result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
 	assert result.dof_model == 10
 	assert result.dof_resid == 21
@@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
 		'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
 	})
-	result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
+	result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
 	# Some numerical differences as intercept term is very negative
 	lrtest_result = result.lrtest_null()
@@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
 ======================================================
 Dep. Variable:  Unhealthy  |  No. Observations:     32
        Model:      Logit  |         Df. Model:      2
-       Method:        MLE  |     Df. Residuals:     29
+         Date: {0:%Y-%m-%d}  |     Df. Residuals:     29
-         Date: {0:%Y-%m-%d}  |         Pseudo R²:   0.26
+         Time:   {0:%H:%M:%S}  |         Pseudo R²:   0.26
-         Time:   {0:%H:%M:%S}  |          LL-Model: -11.47
+  Std. Errors: Non-Robust  |          LL-Model: -11.47
-  Std. Errors: Non-Robust  |           LL-Null: -15.44
+                           |           LL-Null: -15.44
                           |            p (LR):  0.02*
 ======================================================
                exp(β)   (95% CI)          p   
@@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
 		'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
 	})
-	result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
+	result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
 	assert isinstance(result.terms['Response'], CategoricalTerm)
 	assert result.terms['Response'].ref_category == False
@@ -217,15 +235,15 @@ def test_regress_penalisedlogit_kleinman():
 	assert lrtest_result.dof == 1
 	assert lrtest_result.pvalue < 0.0001
-	expected_summary = '''          Penalised Logistic Regression Results          
+	expected_summary = '''           Penalised Logistic Regression Results            
-=========================================================
+============================================================
-Dep. Variable:      Outcome  |  No. Observations:     240
+Dep. Variable:         Outcome  |  No. Observations:     240
-        Model:        Logit  |         Df. Model:       1
+        Model: Penalised Logit  |         Df. Model:       1
-       Method: Penalised ML  |         Pseudo R²:    0.37
+         Date:      {0:%Y-%m-%d}  |         Pseudo R²:    0.37
-         Date:   {0:%Y-%m-%d}  |          LL-Model:  -66.43
+         Time:        {0:%H:%M:%S}  |          LL-Model:  -66.43
-         Time:     {0:%H:%M:%S}  |           LL-Null: -105.91
+  Std. Errors:      Non-Robust  |           LL-Null: -105.91
-  Std. Errors:   Non-Robust  |            p (LR): <0.001*
+                                |            p (LR): <0.001*
-=========================================================
+============================================================
                β      (95% CI)          p   
 ---------------------------------------------
 (Intercept)   -2.28 (-2.77 - -1.85)   <0.001*
--- a/yli/__init__.py
+++ b/yli/__init__.py
@@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
 from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
 from .graphs import init_fonts
 from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
-from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
+from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
 from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
-from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
+from .survival import kaplanmeier, logrank, turnbull
 from .utils import as_ordinal
 def reload_me():
--- a/yli/regress.py
+++ b/yli/regress.py
--- a/yli/survival.py
+++ b/yli/survival.py
@@ -20,11 +20,7 @@ import statsmodels.api as sm
 from .config import config
 from .sig_tests import ChiSquaredResult
-from .regress import RegressionResult, SingleTerm
+from .utils import Estimate, check_nan
-from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
-from datetime import datetime
-import weakref
 def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
 	"""
@@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
 	statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
 	return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
-# --------------------------------
-# Interval-censored Cox regression
-def cox_interval_censored(
-	df, time_left, time_right, formula, *,
-	bootstrap_samples=100,
-	nan_policy='warn',
-	bool_baselevels=False, exp=True,
-):
-	# TODO: Documentation
-	df_ref = weakref.ref(df)
-	# Check for/clean NaNs in input columns
-	columns = [time_left, time_right] + cols_for_formula(formula, df)
-	df = df[columns]
-	df = check_nan(df, nan_policy)
-	# FIXME: Ensure numeric type for dependent variable
-	#df[dep], dep_categories = as_numeric(df[dep])
-	if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64':
-		raise NotImplementedError('Time dtypes must be float64')
-	# Convert pandas nullable types for independent variables
-	df = convert_pandas_nullable(df)
-	# ---------
-	# Fit model
-	# lifelines.CoxPHFitter doesn't do confidence intervals so we use R
-	import rpy2.robjects as ro
-	import rpy2.robjects.packages
-	import rpy2.robjects.pandas2ri
-	# Convert bool to int otherwise rpy2 chokes
-	df = df.replace({False: 0, True: 1})
-	# Import icenReg
-	ro.packages.importr('icenReg')
-	with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
-		with ro.local_context() as lc:
-			# Convert DataFrame to R
-			lc['df'] = df
-			# Transfer other parameters to R
-			lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
-			lc['bootstrap_samples'] = bootstrap_samples
-			# FIXME: Seed bootstrap RNG?
-			# Fit the model
-			ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
-			model = ro.r('model')
-			# Hard to access attributes through rpy2
-			term_parameters = ro.r('model$coef')
-			term_names = ro.r('names(model$coef)')
-			term_cis = ro.r('confint(model)')
-			cov_matrix = ro.r('model$var')
-			llf = ro.r('model$llk')[0]
-			# TODO: Handle categorical terms?
-			terms = {}
-			for i in range(len(term_parameters)):
-				# These values not directly exposed so we must calculate them
-				se = np.sqrt(cov_matrix[i, i])
-				pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(term_parameters[i]))
-				term = SingleTerm(term_names[i], Estimate(term_parameters[i], term_cis[i][0], term_cis[i][1]), pvalue)
-				terms[term_names[i]] = term
-			result = RegressionResult(
-				None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
-				model,
-				'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
-				len(df), None, None, datetime.now(), 'Bootstrap',
-				terms,
-				llf, None,
-				None, None, None,
-				[],
-				exp
-			)
-	return result