Large refactor of yli.regress
This commit is contained in:
parent
dbfcec56c3
commit
ac2aca7b8f
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -17,7 +17,6 @@
|
||||
from pytest import approx
|
||||
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
|
||||
import yli
|
||||
|
||||
@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
|
||||
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
|
||||
})
|
||||
|
||||
result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
|
||||
result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
|
||||
|
||||
assert result.statistic == approx(545.316/18.4366, rel=0.001)
|
||||
assert result.dof1 == 2
|
||||
|
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -17,7 +17,6 @@
|
||||
from pytest import approx
|
||||
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
|
||||
import yli
|
||||
|
||||
@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
|
||||
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
||||
})
|
||||
|
||||
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||
|
||||
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
|
||||
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
|
||||
|
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
|
@ -42,15 +42,15 @@ def test_ordinallogit_ucla():
|
||||
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
|
||||
|
||||
expected_summary = ''' Ordinal Logistic Regression Results
|
||||
===============================================================
|
||||
==========================================================
|
||||
Dep. Variable: apply | No. Observations: 400
|
||||
Model: Ordinal Logit | Df. Model: 5
|
||||
Method: Maximum Likelihood | Df. Residuals: 395
|
||||
Date: {0:%Y-%m-%d} | Pseudo R²: 0.03
|
||||
Time: {0:%H:%M:%S} | LL-Model: -358.51
|
||||
Std. Errors: Non-Robust | LL-Null: -370.60
|
||||
Date: {0:%Y-%m-%d} | Df. Residuals: 395
|
||||
Time: {0:%H:%M:%S} | Pseudo R²: 0.03
|
||||
Std. Errors: Non-Robust | LL-Model: -358.51
|
||||
| LL-Null: -370.60
|
||||
| p (LR): <0.001*
|
||||
===============================================================
|
||||
============================================================
|
||||
β (95% CI) p
|
||||
------------------------------------------------------------
|
||||
pared 1.05 (0.53 - 1.57) <0.001*
|
||||
|
@ -1,5 +1,5 @@
|
||||
# scipy-yli: Helpful SciPy utilities and recipes
|
||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
||||
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
@ -14,11 +14,11 @@
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import pytest
|
||||
from pytest import approx
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
|
||||
import yli
|
||||
from yli.regress import CategoricalTerm
|
||||
@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
|
||||
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
|
||||
})
|
||||
|
||||
result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
|
||||
result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
|
||||
|
||||
assert result.dof_model == 1
|
||||
assert result.dof_resid == 18
|
||||
@ -47,8 +47,26 @@ def test_regress_ols_ol11_4():
|
||||
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
|
||||
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
|
||||
|
||||
expected_summary = ''' Ordinary Least Squares Regression Results
|
||||
=======================================================
|
||||
Dep. Variable: GrowthRet | No. Observations: 20
|
||||
Model: OLS | Df. Model: 1
|
||||
Date: {0:%Y-%m-%d} | Df. Residuals: 18
|
||||
Time: {0:%H:%M:%S} | R²: 0.74
|
||||
Std. Errors: Non-Robust | F: 52.01
|
||||
| p (F): <0.001*
|
||||
=======================================================
|
||||
β (95% CI) p
|
||||
----------------------------------------------
|
||||
(Intercept) 47.48 (38.17 - 56.78) <0.001*
|
||||
SoilPh -7.86 (-10.15 - -5.57) <0.001*
|
||||
----------------------------------------------'''.format(result.fitted_dt)
|
||||
|
||||
assert result.summary() == expected_summary
|
||||
|
||||
@pytest.mark.skip('Not implemented in refactored regression implementation')
|
||||
def test_regress_bootstrap_ols_ol11_4():
|
||||
"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
|
||||
"""Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
|
||||
|
||||
df = pd.DataFrame({
|
||||
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
|
||||
@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
|
||||
})
|
||||
df['LNC'] = np.log(df['C'])
|
||||
|
||||
result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
|
||||
result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
|
||||
|
||||
assert result.dof_model == 10
|
||||
assert result.dof_resid == 21
|
||||
@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
|
||||
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
||||
})
|
||||
|
||||
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||
|
||||
# Some numerical differences as intercept term is very negative
|
||||
lrtest_result = result.lrtest_null()
|
||||
@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
|
||||
======================================================
|
||||
Dep. Variable: Unhealthy | No. Observations: 32
|
||||
Model: Logit | Df. Model: 2
|
||||
Method: MLE | Df. Residuals: 29
|
||||
Date: {0:%Y-%m-%d} | Pseudo R²: 0.26
|
||||
Time: {0:%H:%M:%S} | LL-Model: -11.47
|
||||
Std. Errors: Non-Robust | LL-Null: -15.44
|
||||
Date: {0:%Y-%m-%d} | Df. Residuals: 29
|
||||
Time: {0:%H:%M:%S} | Pseudo R²: 0.26
|
||||
Std. Errors: Non-Robust | LL-Model: -11.47
|
||||
| LL-Null: -15.44
|
||||
| p (LR): 0.02*
|
||||
======================================================
|
||||
exp(β) (95% CI) p
|
||||
@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
|
||||
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
|
||||
})
|
||||
|
||||
result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
|
||||
result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
|
||||
|
||||
assert isinstance(result.terms['Response'], CategoricalTerm)
|
||||
assert result.terms['Response'].ref_category == False
|
||||
@ -218,14 +236,14 @@ def test_regress_penalisedlogit_kleinman():
|
||||
assert lrtest_result.pvalue < 0.0001
|
||||
|
||||
expected_summary = ''' Penalised Logistic Regression Results
|
||||
=========================================================
|
||||
============================================================
|
||||
Dep. Variable: Outcome | No. Observations: 240
|
||||
Model: Logit | Df. Model: 1
|
||||
Method: Penalised ML | Pseudo R²: 0.37
|
||||
Date: {0:%Y-%m-%d} | LL-Model: -66.43
|
||||
Time: {0:%H:%M:%S} | LL-Null: -105.91
|
||||
Std. Errors: Non-Robust | p (LR): <0.001*
|
||||
=========================================================
|
||||
Model: Penalised Logit | Df. Model: 1
|
||||
Date: {0:%Y-%m-%d} | Pseudo R²: 0.37
|
||||
Time: {0:%H:%M:%S} | LL-Model: -66.43
|
||||
Std. Errors: Non-Robust | LL-Null: -105.91
|
||||
| p (LR): <0.001*
|
||||
============================================================
|
||||
β (95% CI) p
|
||||
---------------------------------------------
|
||||
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
|
||||
|
@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
|
||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||
from .graphs import init_fonts
|
||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
|
||||
from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
|
||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
|
||||
from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
|
||||
from .survival import kaplanmeier, logrank, turnbull
|
||||
from .utils import as_ordinal
|
||||
|
||||
def reload_me():
|
||||
|
1432
yli/regress.py
1432
yli/regress.py
File diff suppressed because it is too large
Load Diff
@ -20,11 +20,7 @@ import statsmodels.api as sm
|
||||
|
||||
from .config import config
|
||||
from .sig_tests import ChiSquaredResult
|
||||
from .regress import RegressionResult, SingleTerm
|
||||
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
|
||||
|
||||
from datetime import datetime
|
||||
import weakref
|
||||
from .utils import Estimate, check_nan
|
||||
|
||||
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
|
||||
"""
|
||||
@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
|
||||
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
|
||||
|
||||
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
|
||||
|
||||
# --------------------------------
|
||||
# Interval-censored Cox regression
|
||||
|
||||
def cox_interval_censored(
	df, time_left, time_right, formula, *,
	bootstrap_samples=100,
	nan_policy='warn',
	bool_baselevels=False, exp=True,
):
	"""
	Fit an interval-censored Cox regression by calling R's *icenReg* ``ic_sp`` through rpy2

	The model fitted in R is ``Surv(time_left, time_right, type="interval2") ~ formula``.
	Coefficients, their confidence intervals (R ``confint``), the covariance matrix and the
	log-likelihood are read back from the R model object.
	Standard errors and *p* values are not exposed directly, so they are computed here from the
	covariance matrix diagonal, using a normal distribution centred on zero.

	:param df: Data to perform regression on
	:type df: DataFrame
	:param time_left: Column in *df* for the left endpoint of each interval (dtype must be float64)
	:type time_left: str
	:param time_right: Column in *df* for the right endpoint of each interval (dtype must be float64)
	:type time_right: str
	:param formula: Right-hand side of the R model formula
	:type formula: str
	:param bootstrap_samples: Passed to ``ic_sp`` as ``bs_samples``
	:type bootstrap_samples: int
	:param nan_policy: Passed to ``check_nan`` to decide how NaNs in the input columns are handled
	:type nan_policy: str
	:param bool_baselevels: Accepted, but not referenced anywhere in this function body
	:type bool_baselevels: bool
	:param exp: Forwarded unchanged as the last argument of ``RegressionResult``
		(presumably whether to report exponentiated coefficients – confirm against ``RegressionResult``)
	:type exp: bool

	:raises NotImplementedError: If either time column is not of dtype float64

	:return: The fitted model wrapped in a ``RegressionResult``
	:rtype: RegressionResult
	"""
	
	# Keep only a weak reference to the caller's DataFrame; *df* is rebound to a cleaned subset below
	df_ref = weakref.ref(df)
	
	# Check for/clean NaNs in input columns
	columns = [time_left, time_right] + cols_for_formula(formula, df)
	
	df = df[columns]
	df = check_nan(df, nan_policy)
	
	# FIXME: Ensure numeric type for dependent variable
	#df[dep], dep_categories = as_numeric(df[dep])
	if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64':
		raise NotImplementedError('Time dtypes must be float64')
	
	# Convert pandas nullable types for independent variables
	df = convert_pandas_nullable(df)
	
	# ---------
	# Fit model
	
	# lifelines.CoxPHFitter doesn't do confidence intervals so we use R
	
	# Imported lazily so that rpy2 is only required when this function is actually called
	import rpy2.robjects as ro
	import rpy2.robjects.packages
	import rpy2.robjects.pandas2ri
	
	# Convert bool to int otherwise rpy2 chokes
	df = df.replace({False: 0, True: 1})
	
	# Import icenReg
	ro.packages.importr('icenReg')
	
	with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
		with ro.local_context() as lc:
			# Convert DataFrame to R
			lc['df'] = df
			
			# Transfer other parameters to R
			lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
			lc['bootstrap_samples'] = bootstrap_samples
			
			# FIXME: Seed bootstrap RNG?
			
			# Fit the model
			ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
			
			model = ro.r('model')
			# Hard to access attributes through rpy2
			term_parameters = ro.r('model$coef')
			term_names = ro.r('names(model$coef)')
			term_cis = ro.r('confint(model)')
			cov_matrix = ro.r('model$var')
			llf = ro.r('model$llk')[0]
	
	# TODO: Handle categorical terms?
	terms = {}
	for i, beta in enumerate(term_parameters):
		name = term_names[i]
		
		# These values not directly exposed so we must calculate them
		se = np.sqrt(cov_matrix[i, i])
		# Two-sided p value under a normal distribution with mean 0 and standard deviation se
		pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(beta))
		
		terms[name] = SingleTerm(name, Estimate(beta, term_cis[i][0], term_cis[i][1]), pvalue)
	
	# Positional arguments must stay in exactly this order to match RegressionResult's constructor
	result = RegressionResult(
		None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
		model,
		'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
		len(df), None, None, datetime.now(), 'Bootstrap',
		terms,
		llf, None,
		None, None, None,
		[],
		exp
	)
	
	return result
|
||||
|
Loading…
Reference in New Issue
Block a user