Large refactor of yli.regress
This commit is contained in:
parent
dbfcec56c3
commit
ac2aca7b8f
@ -1,5 +1,5 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
@ -17,7 +17,6 @@
|
|||||||
from pytest import approx
|
from pytest import approx
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import statsmodels.api as sm
|
|
||||||
|
|
||||||
import yli
|
import yli
|
||||||
|
|
||||||
@ -44,7 +43,7 @@ def test_regress_ftest_ol8_2():
|
|||||||
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
|
'Score': [96, 79, 91, 85, 83, 91, 82, 87, 77, 76, 74, 73, 78, 71, 80, 66, 73, 69, 66, 77, 73, 71, 70, 74]
|
||||||
})
|
})
|
||||||
|
|
||||||
result = yli.regress(sm.OLS, df, 'Score', 'C(Method)').ftest()
|
result = yli.regress(yli.OLS, df, 'Score', 'C(Method)').ftest()
|
||||||
|
|
||||||
assert result.statistic == approx(545.316/18.4366, rel=0.001)
|
assert result.statistic == approx(545.316/18.4366, rel=0.001)
|
||||||
assert result.dof1 == 2
|
assert result.dof1 == 2
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
@ -17,7 +17,6 @@
|
|||||||
from pytest import approx
|
from pytest import approx
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import statsmodels.api as sm
|
|
||||||
|
|
||||||
import yli
|
import yli
|
||||||
|
|
||||||
@ -30,7 +29,7 @@ def test_afbf_logit_beta_zero():
|
|||||||
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
||||||
})
|
})
|
||||||
|
|
||||||
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||||
|
|
||||||
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
|
# model <- glm(Unhealthy ~ Fibrinogen + GammaGlobulin, data=df, family=binomial())
|
||||||
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
|
# bf_fit <- BF(model, hypothesis="Fibrinogen = 0")
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
@ -42,15 +42,15 @@ def test_ordinallogit_ucla():
|
|||||||
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
|
assert result.terms['(Cutoffs)'].categories['somewhat likely/very likely'].beta.ci_upper == approx(5.875195, abs=0.001)
|
||||||
|
|
||||||
expected_summary = ''' Ordinal Logistic Regression Results
|
expected_summary = ''' Ordinal Logistic Regression Results
|
||||||
===============================================================
|
==========================================================
|
||||||
Dep. Variable: apply | No. Observations: 400
|
Dep. Variable: apply | No. Observations: 400
|
||||||
Model: OrdinalLogit | Df. Model: 5
|
Model: Ordinal Logit | Df. Model: 5
|
||||||
Method: Maximum Likelihood | Df. Residuals: 395
|
Date: {0:%Y-%m-%d} | Df. Residuals: 395
|
||||||
Date: {0:%Y-%m-%d} | Pseudo R²: 0.03
|
Time: {0:%H:%M:%S} | Pseudo R²: 0.03
|
||||||
Time: {0:%H:%M:%S} | LL-Model: -358.51
|
Std. Errors: Non-Robust | LL-Model: -358.51
|
||||||
Std. Errors: Non-Robust | LL-Null: -370.60
|
| LL-Null: -370.60
|
||||||
| p (LR): <0.001*
|
| p (LR): <0.001*
|
||||||
===============================================================
|
============================================================
|
||||||
β (95% CI) p
|
β (95% CI) p
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
pared 1.05 (0.53 - 1.57) <0.001*
|
pared 1.05 (0.53 - 1.57) <0.001*
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# scipy-yli: Helpful SciPy utilities and recipes
|
# scipy-yli: Helpful SciPy utilities and recipes
|
||||||
# Copyright © 2022 Lee Yingtong Li (RunasSudo)
|
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
||||||
#
|
#
|
||||||
# This program is free software: you can redistribute it and/or modify
|
# This program is free software: you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU Affero General Public License as published by
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
@ -14,11 +14,11 @@
|
|||||||
# You should have received a copy of the GNU Affero General Public License
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import pytest
|
||||||
from pytest import approx
|
from pytest import approx
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import statsmodels.api as sm
|
|
||||||
|
|
||||||
import yli
|
import yli
|
||||||
from yli.regress import CategoricalTerm
|
from yli.regress import CategoricalTerm
|
||||||
@ -31,7 +31,7 @@ def test_regress_ols_ol11_4():
|
|||||||
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
|
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
|
||||||
})
|
})
|
||||||
|
|
||||||
result = yli.regress(sm.OLS, df, 'GrowthRet', 'SoilPh')
|
result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
|
||||||
|
|
||||||
assert result.dof_model == 1
|
assert result.dof_model == 1
|
||||||
assert result.dof_resid == 18
|
assert result.dof_resid == 18
|
||||||
@ -47,8 +47,26 @@ def test_regress_ols_ol11_4():
|
|||||||
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
|
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
|
||||||
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
|
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
|
||||||
|
|
||||||
|
expected_summary = ''' Ordinary Least Squares Regression Results
|
||||||
|
=======================================================
|
||||||
|
Dep. Variable: GrowthRet | No. Observations: 20
|
||||||
|
Model: OLS | Df. Model: 1
|
||||||
|
Date: {0:%Y-%m-%d} | Df. Residuals: 18
|
||||||
|
Time: {0:%H:%M:%S} | R²: 0.74
|
||||||
|
Std. Errors: Non-Robust | F: 52.01
|
||||||
|
| p (F): <0.001*
|
||||||
|
=======================================================
|
||||||
|
β (95% CI) p
|
||||||
|
----------------------------------------------
|
||||||
|
(Intercept) 47.48 (38.17 - 56.78) <0.001*
|
||||||
|
SoilPh -7.86 (-10.15 - -5.57) <0.001*
|
||||||
|
----------------------------------------------'''.format(result.fitted_dt)
|
||||||
|
|
||||||
|
assert result.summary() == expected_summary
|
||||||
|
|
||||||
|
@pytest.mark.skip('Not implemented in refactored regression implementation')
|
||||||
def test_regress_bootstrap_ols_ol11_4():
|
def test_regress_bootstrap_ols_ol11_4():
|
||||||
"""Compare RegressionResult.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
|
"""Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
|
||||||
|
|
||||||
df = pd.DataFrame({
|
df = pd.DataFrame({
|
||||||
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
|
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
|
||||||
@ -86,7 +104,7 @@ def test_regress_ols_ol13_5():
|
|||||||
})
|
})
|
||||||
df['LNC'] = np.log(df['C'])
|
df['LNC'] = np.log(df['C'])
|
||||||
|
|
||||||
result = yli.regress(sm.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
|
result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
|
||||||
|
|
||||||
assert result.dof_model == 10
|
assert result.dof_model == 10
|
||||||
assert result.dof_resid == 21
|
assert result.dof_resid == 21
|
||||||
@ -126,7 +144,7 @@ def test_regress_logit_ol12_23():
|
|||||||
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
|
||||||
})
|
})
|
||||||
|
|
||||||
result = yli.regress(sm.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
|
||||||
|
|
||||||
# Some numerical differences as intercept term is very negative
|
# Some numerical differences as intercept term is very negative
|
||||||
lrtest_result = result.lrtest_null()
|
lrtest_result = result.lrtest_null()
|
||||||
@ -152,10 +170,10 @@ def test_regress_logit_ol12_23():
|
|||||||
======================================================
|
======================================================
|
||||||
Dep. Variable: Unhealthy | No. Observations: 32
|
Dep. Variable: Unhealthy | No. Observations: 32
|
||||||
Model: Logit | Df. Model: 2
|
Model: Logit | Df. Model: 2
|
||||||
Method: MLE | Df. Residuals: 29
|
Date: {0:%Y-%m-%d} | Df. Residuals: 29
|
||||||
Date: {0:%Y-%m-%d} | Pseudo R²: 0.26
|
Time: {0:%H:%M:%S} | Pseudo R²: 0.26
|
||||||
Time: {0:%H:%M:%S} | LL-Model: -11.47
|
Std. Errors: Non-Robust | LL-Model: -11.47
|
||||||
Std. Errors: Non-Robust | LL-Null: -15.44
|
| LL-Null: -15.44
|
||||||
| p (LR): 0.02*
|
| p (LR): 0.02*
|
||||||
======================================================
|
======================================================
|
||||||
exp(β) (95% CI) p
|
exp(β) (95% CI) p
|
||||||
@ -182,7 +200,7 @@ def test_regress_logit_ol10_18():
|
|||||||
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
|
'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
|
||||||
})
|
})
|
||||||
|
|
||||||
result = yli.regress(sm.Logit, df, 'Stress', 'Response', bool_baselevels=True)
|
result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
|
||||||
|
|
||||||
assert isinstance(result.terms['Response'], CategoricalTerm)
|
assert isinstance(result.terms['Response'], CategoricalTerm)
|
||||||
assert result.terms['Response'].ref_category == False
|
assert result.terms['Response'].ref_category == False
|
||||||
@ -218,14 +236,14 @@ def test_regress_penalisedlogit_kleinman():
|
|||||||
assert lrtest_result.pvalue < 0.0001
|
assert lrtest_result.pvalue < 0.0001
|
||||||
|
|
||||||
expected_summary = ''' Penalised Logistic Regression Results
|
expected_summary = ''' Penalised Logistic Regression Results
|
||||||
=========================================================
|
============================================================
|
||||||
Dep. Variable: Outcome | No. Observations: 240
|
Dep. Variable: Outcome | No. Observations: 240
|
||||||
Model: Logit | Df. Model: 1
|
Model: Penalised Logit | Df. Model: 1
|
||||||
Method: Penalised ML | Pseudo R²: 0.37
|
Date: {0:%Y-%m-%d} | Pseudo R²: 0.37
|
||||||
Date: {0:%Y-%m-%d} | LL-Model: -66.43
|
Time: {0:%H:%M:%S} | LL-Model: -66.43
|
||||||
Time: {0:%H:%M:%S} | LL-Null: -105.91
|
Std. Errors: Non-Robust | LL-Null: -105.91
|
||||||
Std. Errors: Non-Robust | p (LR): <0.001*
|
| p (LR): <0.001*
|
||||||
=========================================================
|
============================================================
|
||||||
β (95% CI) p
|
β (95% CI) p
|
||||||
---------------------------------------------
|
---------------------------------------------
|
||||||
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
|
(Intercept) -2.28 (-2.77 - -1.85) <0.001*
|
||||||
|
@ -20,9 +20,9 @@ from .descriptives import auto_correlations, auto_descriptives
|
|||||||
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
from .distributions import beta_oddsratio, beta_ratio, hdi, transformed_dist
|
||||||
from .graphs import init_fonts
|
from .graphs import init_fonts
|
||||||
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
from .io import pickle_read_compressed, pickle_read_encrypted, pickle_write_compressed, pickle_write_encrypted
|
||||||
from .regress import OrdinalLogit, PenalisedLogit, logit_then_regress, regress, vif
|
from .regress import Logit, OLS, OrdinalLogit, PenalisedLogit, regress, vif
|
||||||
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
|
from .sig_tests import anova_oneway, auto_univariable, chi2, mannwhitney, pearsonr, spearman, ttest_ind
|
||||||
from .survival import cox_interval_censored, kaplanmeier, logrank, turnbull
|
from .survival import kaplanmeier, logrank, turnbull
|
||||||
from .utils import as_ordinal
|
from .utils import as_ordinal
|
||||||
|
|
||||||
def reload_me():
|
def reload_me():
|
||||||
|
1432
yli/regress.py
1432
yli/regress.py
File diff suppressed because it is too large
Load Diff
@ -20,11 +20,7 @@ import statsmodels.api as sm
|
|||||||
|
|
||||||
from .config import config
|
from .config import config
|
||||||
from .sig_tests import ChiSquaredResult
|
from .sig_tests import ChiSquaredResult
|
||||||
from .regress import RegressionResult, SingleTerm
|
from .utils import Estimate, check_nan
|
||||||
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import weakref
|
|
||||||
|
|
||||||
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
|
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
|
||||||
"""
|
"""
|
||||||
@ -276,91 +272,3 @@ def logrank(df, time, status, by, nan_policy='warn'):
|
|||||||
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
|
statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])
|
||||||
|
|
||||||
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
|
return ChiSquaredResult(statistic=statistic, dof=1, pvalue=pvalue)
|
||||||
|
|
||||||
# --------------------------------
|
|
||||||
# Interval-censored Cox regression
|
|
||||||
|
|
||||||
def cox_interval_censored(
|
|
||||||
df, time_left, time_right, formula, *,
|
|
||||||
bootstrap_samples=100,
|
|
||||||
nan_policy='warn',
|
|
||||||
bool_baselevels=False, exp=True,
|
|
||||||
):
|
|
||||||
# TODO: Documentation
|
|
||||||
|
|
||||||
df_ref = weakref.ref(df)
|
|
||||||
|
|
||||||
# Check for/clean NaNs in input columns
|
|
||||||
columns = [time_left, time_right] + cols_for_formula(formula, df)
|
|
||||||
|
|
||||||
df = df[columns]
|
|
||||||
df = check_nan(df, nan_policy)
|
|
||||||
|
|
||||||
# FIXME: Ensure numeric type for dependent variable
|
|
||||||
#df[dep], dep_categories = as_numeric(df[dep])
|
|
||||||
if df[time_left].dtype != 'float64' or df[time_right].dtype != 'float64':
|
|
||||||
raise NotImplementedError('Time dtypes must be float64')
|
|
||||||
|
|
||||||
# Convert pandas nullable types for independent variables
|
|
||||||
df = convert_pandas_nullable(df)
|
|
||||||
|
|
||||||
# ---------
|
|
||||||
# Fit model
|
|
||||||
|
|
||||||
# lifelines.CoxPHFitter doesn't do confidence intervals so we use R
|
|
||||||
|
|
||||||
import rpy2.robjects as ro
|
|
||||||
import rpy2.robjects.packages
|
|
||||||
import rpy2.robjects.pandas2ri
|
|
||||||
|
|
||||||
# Convert bool to int otherwise rpy2 chokes
|
|
||||||
df = df.replace({False: 0, True: 1})
|
|
||||||
|
|
||||||
# Import icenReg
|
|
||||||
ro.packages.importr('icenReg')
|
|
||||||
|
|
||||||
with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
|
|
||||||
with ro.local_context() as lc:
|
|
||||||
# Convert DataFrame to R
|
|
||||||
lc['df'] = df
|
|
||||||
|
|
||||||
# Transfer other parameters to R
|
|
||||||
lc['formula_'] = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)
|
|
||||||
lc['bootstrap_samples'] = bootstrap_samples
|
|
||||||
|
|
||||||
# FIXME: Seed bootstrap RNG?
|
|
||||||
|
|
||||||
# Fit the model
|
|
||||||
ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')
|
|
||||||
|
|
||||||
model = ro.r('model')
|
|
||||||
# Hard to access attributes through rpy2
|
|
||||||
term_parameters = ro.r('model$coef')
|
|
||||||
term_names = ro.r('names(model$coef)')
|
|
||||||
term_cis = ro.r('confint(model)')
|
|
||||||
cov_matrix = ro.r('model$var')
|
|
||||||
llf = ro.r('model$llk')[0]
|
|
||||||
|
|
||||||
# TODO: Handle categorical terms?
|
|
||||||
terms = {}
|
|
||||||
for i in range(len(term_parameters)):
|
|
||||||
# These values not directly exposed so we must calculate them
|
|
||||||
se = np.sqrt(cov_matrix[i, i])
|
|
||||||
pvalue = 2 * stats.norm(loc=0, scale=se).cdf(-np.abs(term_parameters[i]))
|
|
||||||
|
|
||||||
term = SingleTerm(term_names[i], Estimate(term_parameters[i], term_cis[i][0], term_cis[i][1]), pvalue)
|
|
||||||
terms[term_names[i]] = term
|
|
||||||
|
|
||||||
result = RegressionResult(
|
|
||||||
None, df_ref, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
|
|
||||||
model,
|
|
||||||
'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
|
|
||||||
len(df), None, None, datetime.now(), 'Bootstrap',
|
|
||||||
terms,
|
|
||||||
llf, None,
|
|
||||||
None, None, None,
|
|
||||||
[],
|
|
||||||
exp
|
|
||||||
)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
Loading…
Reference in New Issue
Block a user