Large refactor of yli.regress

This commit is contained in:
RunasSudo 2023-04-16 21:56:09 +10:00
parent dbfcec56c3
commit ac2aca7b8f
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A
8 changed files with 701 additions and 979 deletions

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -17,7 +17,6 @@
from pytest import approx
import pandas as pd
import statsmodels.api as sm
import yli
@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
})
result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
assert result.statistic == approx(545.316/18.4366, rel=0.001)
assert result.dof1 == 2

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -17,7 +17,6 @@
from pytest import approx
import pandas as pd
import statsmodels.api as sm
import yli
@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
})
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -68,10 +68,10 @@ def test_chi2_ol10_18():
assert result.oddsratio.ci_lower == approx(1.113, abs=0.001)
assert result.oddsratio.ci_upper == approx(1.596, abs=0.001)
expected_summary = '''Stress False True
Response
False 250 400
True 750 1600
expected_summary = '''Stress False True
Response
False 250 400
True 750 1600
χ²(1) = 9.82; p = 0.002*
OR (95% CI) = 1.33 (1.111.60)

View File

@ -41,16 +41,16 @@ def test_ordinallogit_ucla():
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_lower == approx(2.72234, abs=0.001)
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
expected_summary = ''' Ordinal Logistic Regression Results
===============================================================
Dep. Variable: apply | No. Observations: 400
Model: OrdinalLogit | Df. Model: 5
Method: Maximum Likelihood | Df. Residuals: 395
Date: {0:%Y-%m-%d} | Pseudo : 0.03
Time: {0:%H:%M:%S} | LL-Model: -358.51
Std. Errors: Non-Robust | LL-Null: -370.60
| p (LR): <0.001*
===============================================================
expected_summary = ''' Ordinal Logistic Regression Results
==========================================================
Dep. Variable: apply | No. Observations: 400
Model: Ordinal Logit | Df. Model: 5
Date: {0:%Y-%m-%d} | Df. Residuals: 395
Time: {0:%H:%M:%S} | Pseudo : 0.03
Std. Errors: Non-Robust | LL-Model: -358.51
| LL-Null: -370.60
| p (LR): <0.001*
============================================================
β (95% CI) p
------------------------------------------------------------
pared 1.05 (0.53 - 1.57) <0.001*

View File

@ -1,5 +1,5 @@
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
@ -14,11 +14,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pytest
from pytest import approx
import numpy as np
import pandas as pd
import statsmodels.api as sm
import yli
from yli.regress import CategoricalTerm
@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
})
result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
assert result.dof_model == 1
assert result.dof_resid == 18
@ -46,9 +46,27 @@ def test_regress_ols_ol11_4():
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
expected_summary = ''' Ordinary Least Squares Regression Results
=======================================================
Dep. Variable: GrowthRet | No. Observations: 20
Model: OLS | Df. Model: 1
Date: {0:%Y-%m-%d} | Df. Residuals: 18
Time: {0:%H:%M:%S} | : 0.74
Std. Errors: Non-Robust | F: 52.01
| p (F): <0.001*
=======================================================
β (95% CI) p
----------------------------------------------
(Intercept) 47.48 (38.17 - 56.78) <0.001*
SoilPh -7.86 (-10.15 - -5.57) <0.001*
----------------------------------------------'''.format(result.fitted_dt)
assert result.summary() == expected_summary
@pytest.mark.skip('Not implemented in refactored regression implementation')
def test_regress_bootstrap_ols_ol11_4():
"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
"""Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
df = pd.DataFrame({
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
})
df['LNC'] = np.log(df['C'])
result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
assert result.dof_model == 10
assert result.dof_resid == 21
@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
})
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
# Some numerical differences as intercept term is very negative
lrtest_result = result.lrtest_null()
@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
======================================================
Dep. Variable: Unhealthy | No. Observations: 32
Model: Logit | Df. Model: 2
Method: MLE | Df. Residuals: 29
Date: {0:%Y-%m-%d} | Pseudo : 0.26
Time: {0:%H:%M:%S} | LL-Model: -11.47
Std. Errors: Non-Robust | LL-Null: -15.44
Date: {0:%Y-%m-%d} | Df. Residuals: 29
Time: {0:%H:%M:%S} | Pseudo : 0.26
Std. Errors: Non-Robust | LL-Model: -11.47
| LL-Null: -15.44
| p (LR): 0.02*
======================================================
exp(β) (95% CI) p
@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
})
result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
assert isinstance(result.terms['Response'], CategoricalTerm)
assert result.terms['Response'].ref_category == False
@ -217,15 +235,15 @@ def test_regress_penalisedlogit_kleinman():
assert lrtest_result.dof == 1
assert lrtest_result.pvalue < 0.0001
expected_summary = ''' Penalised Logistic Regression Results
=========================================================
Dep. Variable: Outcome | No. Observations: 240
Model: Logit | Df. Model: 1
Method: Penalised ML | Pseudo : 0.37
Date: {0:%Y-%m-%d} | LL-Model: -66.43
Time: {0:%H:%M:%S} | LL-Null: -105.91
Std. Errors: Non-Robust | p (LR): <0.001*
=========================================================
expected_summary = ''' Penalised Logistic Regression Results
============================================================
Dep. Variable: Outcome | No. Observations: 240
Model: Penalised Logit | Df. Model: 1
Date: {0:%Y-%m-%d} | Pseudo : 0.37
Time: {0:%H:%M:%S} | LL-Model: -66.43
Std. Errors: Non-Robust | LL-Null: -105.91
| p (LR): <0.001*
============================================================
β (95% CI) p
---------------------------------------------
(Intercept) -2.28 (-2.77 - -1.85) <0.001*

View File

@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
from .graphs import init_fonts
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
from .survival import kaplanmeier, logrank, turnbull
from .utils import as_ordinal
def reload_me():

File diff suppressed because it is too large Load Diff

View File

@ -20,11 +20,7 @@ import statsmodels.api as sm
from .config import config
from .sig_tests import ChiSquaredResult
from .regress import RegressionResult, SingleTerm
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
from datetime import datetime
import weakref
from .utils import Estimate, check_nan
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
"""
@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
# --------------------------------
# Interval-censored Cox regression
def cox_interval_censored(
    df, time_left, time_right, formula, *,
    bootstrap_samples=100,
    nan_policy='warn',
    bool_baselevels=False, exp=True,
):
    """
    Fit an interval-censored Cox regression by delegating to R's icenReg (via rpy2)

    :param df: DataFrame holding the interval bounds and the covariates
    :param time_left: Name of the column with the left interval bound (dtype must be float64)
    :param time_right: Name of the column with the right interval bound (dtype must be float64)
    :param formula: Right-hand side of the model formula; spliced into the R formula string
    :param bootstrap_samples: Forwarded to R ``ic_sp`` as ``bs_samples``
    :param nan_policy: Forwarded to ``check_nan``
    :param bool_baselevels: Accepted for signature compatibility; not read in this function body
    :param exp: Forwarded as the final argument of ``RegressionResult``

    :raises NotImplementedError: If either time column is not of dtype float64

    :return: A ``RegressionResult`` with one ``SingleTerm`` per fitted coefficient
    """
    # TODO: Documentation (fuller description of the model and result still outstanding)

    # Take the weak reference before *df* is rebound to the cleaned subset below
    original_df_ref = weakref.ref(df)

    # Restrict to the columns the model needs, then apply the NaN policy
    needed_columns = [time_left, time_right] + cols_for_formula(formula, df)
    df = check_nan(df[needed_columns], nan_policy)

    # FIXME: Ensure numeric type for dependent variable
    # (previously sketched as: df[dep], dep_categories = as_numeric(df[dep]))
    if any(df[bound].dtype != 'float64' for bound in (time_left, time_right)):
        raise NotImplementedError('Time dtypes must be float64')

    # Convert pandas nullable types for independent variables
    df = convert_pandas_nullable(df)

    # ---------
    # Fit model

    # lifelines.CoxPHFitter doesn't do confidence intervals so we use R
    import rpy2.robjects as ro
    import rpy2.robjects.packages
    import rpy2.robjects.pandas2ri

    # rpy2 chokes on bool, so map to int first
    df = df.replace({False: 0, True: 1})

    # Import icenReg
    ro.packages.importr('icenReg')

    pandas_converter = ro.default_converter + ro.pandas2ri.converter
    with ro.conversion.localconverter(pandas_converter):
        with ro.local_context() as r_env:
            # Hand the DataFrame and the remaining parameters over to R
            r_env['df'] = df
            r_env['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
            r_env['bootstrap_samples'] = bootstrap_samples

            # FIXME: Seed bootstrap RNG?

            # Fit the model
            ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
            fitted_model = ro.r('model')

            # Attributes are hard to reach through rpy2, so pull each one out with R code
            coefficients = ro.r('model$coef')
            coefficient_names = ro.r('names(model$coef)')
            confidence_limits = ro.r('confint(model)')
            covariance = ro.r('model$var')
            log_likelihood = ro.r('model$llk')[0]

    # TODO: Handle categorical terms?
    terms = {}
    for idx, coefficient in enumerate(coefficients):
        name = coefficient_names[idx]

        # Standard error and p value are not directly exposed, so derive them from the covariance matrix
        std_error = np.sqrt(covariance[idx, idx])
        pvalue = 2 * stats.norm(loc=0, scale=std_error).cdf(-np.abs(coefficient))

        estimate = Estimate(coefficient, confidence_limits[idx][0], confidence_limits[idx][1])
        terms[name] = SingleTerm(name, estimate, pvalue)

    dep_label = '({}, {}]'.format(time_left, time_right)

    return RegressionResult(
        None, original_df_ref, dep_label, formula, nan_policy, None, None,
        fitted_model,
        'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
        len(df), None, None, datetime.now(), 'Bootstrap',
        terms,
        log_likelihood, None,
        None, None, None,
        [],
        exp
    )