Large refactor of yli.regress

This commit is contained in:
RunasSudo 2023-04-16 21:56:09 +10:00
parent dbfcec56c3
commit ac2aca7b8f
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
8 changed files with 701 additions and 979 deletions

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes # scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo) # Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by # it under the terms of the GNU Affero General Public License as published by
@ -17,7 +17,6 @@
from pytest import approx from pytest import approx
import pandas as pd import pandas as pd
import statsmodels.api as sm
import yli import yli
@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74] 'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
}) })
result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest() result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
assert result.statistic == approx(545.316/18.4366, rel=0.001) assert result.statistic == approx(545.316/18.4366, rel=0.001)
assert result.dof1 == 2 assert result.dof1 == 2

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes # scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo) # Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by # it under the terms of the GNU Affero General Public License as published by
@ -17,7 +17,6 @@
from pytest import approx from pytest import approx
import pandas as pd import pandas as pd
import statsmodels.api as sm
import yli import yli
@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30] 'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
}) })
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial()) # model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0") # bf_fit <- BF(model, hypothesis="Fibrinogen = 0")

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes # scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo) # Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by # it under the terms of the GNU Affero General Public License as published by

View File

@ -42,15 +42,15 @@ def test_ordinallogit_ucla():
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001) assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
expected_summary = ''' Ordinal Logistic Regression Results expected_summary = ''' Ordinal Logistic Regression Results
=============================================================== ==========================================================
Dep. Variable: apply | No. Observations: 400 Dep. Variable: apply | No. Observations: 400
Model: OrdinalLogit | Df. Model: 5 Model: Ordinal Logit | Df. Model: 5
Method: Maximum Likelihood | Df. Residuals: 395 Date: {0:%Y-%m-%d} | Df. Residuals: 395
Date: {0:%Y-%m-%d} | Pseudo : 0.03 Time: {0:%H:%M:%S} | Pseudo : 0.03
Time: {0:%H:%M:%S} | LL-Model: -358.51 Std. Errors: Non-Robust | LL-Model: -358.51
Std. Errors: Non-Robust | LL-Null: -370.60 | LL-Null: -370.60
| p (LR): <0.001* | p (LR): <0.001*
=============================================================== ============================================================
β (95% CI) p β (95% CI) p
------------------------------------------------------------ ------------------------------------------------------------
pared 1.05 (0.53 - 1.57) <0.001* pared 1.05 (0.53 - 1.57) <0.001*

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes # scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo) # Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
# #
# This program is free software: you can redistribute it and/or modify # This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by # it under the terms of the GNU Affero General Public License as published by
@ -14,11 +14,11 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
import pytest
from pytest import approx from pytest import approx
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import statsmodels.api as sm
import yli import yli
from yli.regress import CategoricalTerm from yli.regress import CategoricalTerm
@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34] 'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
}) })
result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh') result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
assert result.dof_model == 1 assert result.dof_model == 1
assert result.dof_resid == 18 assert result.dof_resid == 18
@ -47,8 +47,26 @@ def test_regress_ols_ol11_4():
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01) assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01) assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
expected_summary = ''' Ordinary Least Squares Regression Results
=======================================================
Dep. Variable: GrowthRet | No. Observations: 20
Model: OLS | Df. Model: 1
Date: {0:%Y-%m-%d} | Df. Residuals: 18
Time: {0:%H:%M:%S} | : 0.74
Std. Errors: Non-Robust | F: 52.01
| p (F): <0.001*
=======================================================
β (95% CI) p
----------------------------------------------
(Intercept) 47.48 (38.17 - 56.78) <0.001*
SoilPh -7.86 (-10.15 - -5.57) <0.001*
----------------------------------------------'''.format(result.fitted_dt)
assert result.summary() == expected_summary
@pytest.mark.skip('Not implemented in refactored regression implementation')
def test_regress_bootstrap_ols_ol11_4(): def test_regress_bootstrap_ols_ol11_4():
"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7""" """Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
df = pd.DataFrame({ df = pd.DataFrame({
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2], 'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
}) })
df['LNC'] = np.log(df['C']) df['LNC'] = np.log(df['C'])
result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT') result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
assert result.dof_model == 10 assert result.dof_model == 10
assert result.dof_resid == 21 assert result.dof_resid == 21
@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30] 'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
}) })
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin') result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# Some numerical differences as intercept term is very negative # Some numerical differences as intercept term is very negative
lrtest_result = result.lrtest_null() lrtest_result = result.lrtest_null()
@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
====================================================== ======================================================
Dep. Variable: Unhealthy | No. Observations: 32 Dep. Variable: Unhealthy | No. Observations: 32
Model: Logit | Df. Model: 2 Model: Logit | Df. Model: 2
Method: MLE | Df. Residuals: 29 Date: {0:%Y-%m-%d} | Df. Residuals: 29
Date: {0:%Y-%m-%d} | Pseudo : 0.26 Time: {0:%H:%M:%S} | Pseudo : 0.26
Time: {0:%H:%M:%S} | LL-Model: -11.47 Std. Errors: Non-Robust | LL-Model: -11.47
Std. Errors: Non-Robust | LL-Null: -15.44 | LL-Null: -15.44
| p (LR): 0.02* | p (LR): 0.02*
====================================================== ======================================================
exp(β) (95% CI) p exp(β) (95% CI) p
@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data]) 'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
}) })
result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True) result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
assert isinstance(result.terms['Response'], CategoricalTerm) assert isinstance(result.terms['Response'], CategoricalTerm)
assert result.terms['Response'].ref_category == False assert result.terms['Response'].ref_category == False
@ -218,14 +236,14 @@ def test_regress_penalisedlogit_kleinman():
assert lrtest_result.pvalue < 0.0001 assert lrtest_result.pvalue < 0.0001
expected_summary = ''' Penalised Logistic Regression Results expected_summary = ''' Penalised Logistic Regression Results
========================================================= ============================================================
Dep. Variable: Outcome | No. Observations: 240 Dep. Variable: Outcome | No. Observations: 240
Model: Logit | Df. Model: 1 Model: Penalised Logit | Df. Model: 1
Method: Penalised ML | Pseudo : 0.37 Date: {0:%Y-%m-%d} | Pseudo : 0.37
Date: {0:%Y-%m-%d} | LL-Model: -66.43 Time: {0:%H:%M:%S} | LL-Model: -66.43
Time: {0:%H:%M:%S} | LL-Null: -105.91 Std. Errors: Non-Robust | LL-Null: -105.91
Std. Errors: Non-Robust | p (LR): <0.001* | p (LR): <0.001*
========================================================= ============================================================
β (95% CI) p β (95% CI) p
--------------------------------------------- ---------------------------------------------
(Intercept) -2.28 (-2.77 - -1.85) <0.001* (Intercept) -2.28 (-2.77 - -1.85) <0.001*

View File

@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .graphs import init_fonts from .graphs import init_fonts
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull from .survival import kaplanmeier, logrank, turnbull
from .utils import as_ordinal from .utils import as_ordinal
def reload_me(): def reload_me():

File diff suppressed because it is too large Load Diff

View File

@ -20,11 +20,7 @@ import statsmodels.api as sm
from .config import config from .config import config
from .sig_tests import ChiSquaredResult from .sig_tests import ChiSquaredResult
from .regress import RegressionResult, SingleTerm from .utils import Estimate, check_nan
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
from datetime import datetime
import weakref
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'): def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
""" """
@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by]) statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue) return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
# --------------------------------
# Interval-censored Cox regression
def cox_interval_censored(
    df, time_left, time_right, formula, *,
    bootstrap_samples=100,
    nan_policy='warn',
    bool_baselevels=False, exp=True,
):
    """
    Fit an interval-censored Cox regression by delegating to R's icenReg ``ic_sp`` via rpy2

    The model formula passed to R is ``Surv(time_left, time_right, type="interval2") ~ formula``.

    :param df: Data to perform the regression on (only a weak reference to the original is kept in the result)
    :param time_left: Name of the column holding the left end of each observation interval (must be float64)
    :param time_right: Name of the column holding the right end of each observation interval (must be float64)
    :param formula: Right-hand side of the model formula
    :param bootstrap_samples: Forwarded to ``ic_sp`` as ``bs_samples``
    :param nan_policy: Forwarded to *check_nan* to decide how NaNs in the input columns are handled
    :param bool_baselevels: Accepted but not used by this function  # NOTE(review): confirm whether this is intentional
    :param exp: Forwarded as the final argument of *RegressionResult*

    :raises NotImplementedError: If either time column is not of dtype float64

    :return: A *RegressionResult* labelled 'Interval-Censored Cox Regression', with one *SingleTerm* per model coefficient
    """

    # Keep only a weak handle on the caller's DataFrame before we rebind df
    original_df_ref = weakref.ref(df)

    # Restrict to the columns taking part in the model, then apply the NaN policy
    needed_columns = [time_left, time_right] + cols_for_formula(formula, df)
    df = check_nan(df[needed_columns], nan_policy)

    # FIXME: Ensure numeric type for dependent variable
    #df[dep], dep_categories = as_numeric(df[dep])
    if any(df[time_col].dtype != 'float64' for time_col in (time_left, time_right)):
        raise NotImplementedError('Time dtypes must be float64')

    # Independent variables may use pandas nullable dtypes; convert those
    df = convert_pandas_nullable(df)

    # ---------
    # Fit model

    # lifelines.CoxPHFitter doesn't do confidence intervals so we use R
    import rpy2.robjects as ro
    import rpy2.robjects.packages
    import rpy2.robjects.pandas2ri

    # rpy2 chokes on bool, so represent booleans as 0/1
    df = df.replace({False: 0, True: 1})

    # Load icenReg into the R session
    ro.packages.importr('icenReg')

    converter = ro.default_converter + ro.pandas2ri.converter
    with ro.conversion.localconverter(converter), ro.local_context() as r_env:
        # Hand the data and the fitting parameters over to R
        r_env['df'] = df
        r_env['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
        r_env['bootstrap_samples'] = bootstrap_samples

        # FIXME: Seed bootstrap RNG?

        # Fit the model on the R side
        ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
        model = ro.r('model')

        # Attributes are hard to reach through rpy2, so pull each one out with R code
        term_parameters = ro.r('model$coef')
        term_names = ro.r('names(model$coef)')
        term_cis = ro.r('confint(model)')
        cov_matrix = ro.r('model$var')
        llf = ro.r('model$llk')[0]

    # TODO: Handle categorical terms?
    terms = {}
    for idx, beta in enumerate(term_parameters):
        name = term_names[idx]

        # Standard error and p value are not exposed directly, so derive them
        # from the diagonal of the covariance matrix (two-sided normal test)
        se = np.sqrt(cov_matrix[idx, idx])
        null_dist = stats.norm(loc=0, scale=se)
        pvalue = 2 * null_dist.cdf(-np.abs(beta))

        estimate = Estimate(beta, term_cis[idx][0], term_cis[idx][1])
        terms[name] = SingleTerm(name, estimate, pvalue)

    dep_label = '({}, {}]'.format(time_left, time_right)

    return RegressionResult(
        None, original_df_ref, dep_label, formula, nan_policy, None, None,
        model,
        'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
        len(df), None, None, datetime.now(), 'Bootstrap',
        terms,
        llf, None,
        None, None, None,
        [],
        exp
    )