# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import pytest
from pytest import approx

import numpy as np
import pandas as pd

import yli
from yli.regress import CategoricalTerm


def test_regress_ols_ol11_4():
"""Compare yli.regress for Ott & Longnecker (2016) example 11.4/11.7"""
df = pd.DataFrame({
'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
})
result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
assert result.dof_model == 1
assert result.dof_resid == 18
assert result.f_statistic == approx(52.01, abs=0.01)
assert result.ftest().pvalue < 0.0005
assert result.rsquared == approx(0.7429, abs=0.0001)
assert result.terms['(Intercept)'].beta.point == approx(47.48, abs=0.01)
assert result.terms['(Intercept)'].pvalue < 0.0005
assert result.terms['SoilPh'].beta.point == approx(-7.86, abs=0.01)
assert result.terms['SoilPh'].pvalue < 0.0005
assert result.terms['SoilPh'].beta.ci_lower == approx(-10.15, abs=0.01)
assert result.terms['SoilPh'].beta.ci_upper == approx(-5.57, abs=0.01)
    expected_summary = '''      Ordinary Least Squares Regression Results
=======================================================
Dep. Variable: GrowthRet | No. Observations:         20
Model:               OLS | Df. Model:                 1
Date:         {0:%Y-%m-%d} | Df. Residuals:            18
Time:           {0:%H:%M:%S} | R²:                      0.74
Std. Errors:  Non-Robust | F:                      52.01
                         | p (F):                <0.001*
=======================================================
                β (95% CI)                    p
----------------------------------------------
(Intercept)  47.48 (38.17 - 56.78)     <0.001*
SoilPh       -7.86 (-10.15 - -5.57)    <0.001*
----------------------------------------------'''.format(result.fitted_dt)
    
    assert result.summary() == expected_summary
    assert result._repr_html_() == '<table><caption>Ordinary Least Squares Regression Results</caption><tr><th>Dep. Variable:</th><td>GrowthRet</td><th>No. Observations:</th><td>20</td></tr><tr><th>Model:</th><td>OLS</td><th>Df. Model:</th><td>1</td></tr><tr><th>Date:</th><td>{0:%Y-%m-%d}</td><th>Df. Residuals:</th><td>18</td></tr><tr><th>Time:</th><td>{0:%H:%M:%S}</td><th><i>R</i><sup>2</sup>:</th><td>0.74</td></tr><tr><th>Std. Errors:</th><td>Non-Robust</td><th><i>F</i>:</th><td>52.01</td></tr><tr><th></th><td></td><th><i>p</i> (<i>F</i>):</th><td>&lt;0.001*</td></tr></table><table><tr><th></th><th style="text-align:center"><i>β</i></th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr><tr><th>(Intercept)</th><td>47.48</td><td style="padding-right:0">(38.17</td><td>–</td><td style="padding-left:0">56.78)</td><td style="text-align:left">&lt;0.001*</td></tr><tr><th>SoilPh</th><td>-7.86</td><td style="padding-right:0">(-10.15</td><td>–</td><td style="padding-left:0">-5.57)</td><td style="text-align:left">&lt;0.001*</td></tr></table>'.format(result.fitted_dt)
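

# Illustrative sketch (editorial addition, not part of the test suite proper):
# the intercept and slope asserted above come from a simple linear regression,
# so they can be cross-checked without yli using numpy alone. np.polyfit is the
# only API assumed here; the expected values are those asserted in the test.
def _sketch_ols_crosscheck():
    soil_ph = [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2]
    growth_ret = [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
    # Degree-1 least-squares fit; np.polyfit returns coefficients highest degree first
    slope, intercept = np.polyfit(soil_ph, growth_ret, 1)
    assert intercept == approx(47.48, abs=0.01)
    assert slope == approx(-7.86, abs=0.01)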


@pytest.mark.skip('Not implemented in refactored regression implementation')
def test_regress_bootstrap_ols_ol11_4():
    """Compare RegressionModel.bootstrap for Ott & Longnecker (2016) example 11.4/11.7"""
    
    df = pd.DataFrame({
        'SoilPh': [3.3, 3.4, 3.4, 3.5, 3.6, 3.6, 3.7, 3.7, 3.8, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 5.0, 5.1, 5.2],
        'GrowthRet': [17.78, 21.59, 23.84, 15.13, 23.45, 20.87, 17.78, 20.09, 17.78, 12.46, 14.95, 15.87, 17.45, 14.35, 14.64, 17.25, 12.57, 7.15, 7.50, 4.34]
    })
    
    result = yli.regress(yli.OLS, df, 'GrowthRet', 'SoilPh')
    
    # Fix the seed so the bootstrap resamples are reproducible
    np.random.seed(0)
    result_bs = result.bootstrap(1000)
    
    # Point estimates are unchanged; bootstrap CIs should approximate the parametric CIs
    assert result_bs.terms['(Intercept)'].beta.point == result.terms['(Intercept)'].beta.point
    assert result_bs.terms['(Intercept)'].beta.ci_lower == approx(result.terms['(Intercept)'].beta.ci_lower, rel=0.05)
    assert result_bs.terms['(Intercept)'].beta.ci_upper == approx(result.terms['(Intercept)'].beta.ci_upper, rel=0.05)
    assert result_bs.terms['SoilPh'].beta.point == result.terms['SoilPh'].beta.point
    assert result_bs.terms['SoilPh'].beta.ci_lower == approx(result.terms['SoilPh'].beta.ci_lower, rel=0.1)
    assert result_bs.terms['SoilPh'].beta.ci_upper == approx(result.terms['SoilPh'].beta.ci_upper, rel=0.05)
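

# Illustrative sketch (editorial addition, not part of the test suite proper):
# a manual percentile bootstrap of the OLS slope, i.e. the quantity
# RegressionModel.bootstrap is expected to approximate above. Only pandas and
# numpy APIs are used here; this is not yli's actual implementation.
def _sketch_manual_bootstrap(df, n_samples=1000):
    rng = np.random.default_rng(0)
    slopes = []
    for _ in range(n_samples):
        # Resample rows with replacement and refit the simple linear model
        boot = df.sample(len(df), replace=True, random_state=rng)
        slopes.append(np.polyfit(boot['SoilPh'], boot['GrowthRet'], 1)[0])
    # 95% percentile confidence interval for the slope
    return np.percentile(slopes, [2.5, 97.5])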


def test_regress_ols_ol13_5():
    """Compare yli.regress for Ott & Longnecker (2016) chapter 13.5"""
    
    df = pd.DataFrame({
        'C': [460.05, 452.99, 443.22, 652.32, 642.23, 345.39, 272.37, 317.21, 457.12, 690.19, 350.63, 402.59, 412.18, 495.58, 394.36, 423.32, 712.27, 289.66, 881.24, 490.88, 567.79, 665.99, 621.45, 608.8, 473.64, 697.14, 207.51, 288.48, 284.88, 280.36, 217.38, 270.71],
        'D': [68.58, 67.33, 67.33, 68, 68, 67.92, 68.17, 68.42, 68.42, 68.33, 68.58, 68.75, 68.42, 68.92, 68.92, 68.42, 69.5, 68.42, 69.17, 68.92, 68.75, 70.92, 69.67, 70.08, 70.42, 71.08, 67.25, 67.17, 67.83, 67.83, 67.25, 67.83],
        'T1': [14, 10, 10, 11, 11, 13, 12, 14, 15, 12, 12, 13, 15, 17, 13, 11, 18, 15, 15, 16, 11, 22, 16, 19, 19, 20, 13, 9, 12, 12, 13, 7],
        'T2': [46, 73, 85, 67, 78, 51, 50, 59, 55, 71, 64, 47, 62, 52, 65, 67, 60, 76, 67, 59, 70, 57, 59, 58, 44, 57, 63, 48, 63, 71, 72, 80],
        'S': [687, 1065, 1065, 1065, 1065, 514, 822, 457, 822, 792, 560, 790, 530, 1050, 850, 778, 845, 530, 1090, 1050, 913, 828, 786, 821, 538, 1130, 745, 821, 886, 886, 745, 886],
        'PR': [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1],
        'NE': [1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'CT': [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
        'BW': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
        'N': [14, 1, 1, 12, 12, 3, 5, 1, 5, 2, 3, 6, 2, 7, 16, 3, 17, 2, 1, 8, 15, 20, 18, 3, 19, 21, 8, 7, 11, 11, 8, 11],
        'PT': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    })
    
    # Regress on the natural log of C, as in the textbook example
    df['LNC'] = np.log(df['C'])
    
    result = yli.regress(yli.OLS, df, 'LNC', 'D + T1 + T2 + S + PR + NE + CT + BW + N + PT')
    
    assert result.dof_model == 10
    assert result.dof_resid == 21
    assert result.f_statistic == approx(13.28, abs=0.01)
    assert result.ftest().pvalue < 0.00005
    assert result.rsquared == approx(0.8635, abs=0.0001)
    
    assert result.terms['(Intercept)'].beta.point == approx(-10.63398, abs=0.00001)
    assert result.terms['(Intercept)'].pvalue == approx(0.0766, abs=0.0001)
    assert result.terms['D'].beta.point == approx(0.22760, abs=0.00001)
    assert result.terms['D'].pvalue == approx(0.0157, abs=0.0001)
    assert result.terms['T1'].beta.point == approx(0.00525, abs=0.00001)
    assert result.terms['T1'].pvalue == approx(0.8161, abs=0.0001)
    assert result.terms['T2'].beta.point == approx(0.00561, abs=0.00001)
    assert result.terms['T2'].pvalue == approx(0.2360, abs=0.0001)
    assert result.terms['S'].beta.point == approx(0.00088369, abs=0.00000001)
    assert result.terms['S'].pvalue < 0.0001
    assert result.terms['PR'].beta.point == approx(-0.10813, abs=0.00001)
    assert result.terms['PR'].pvalue == approx(0.2094, abs=0.0001)
    assert result.terms['NE'].beta.point == approx(0.25949, abs=0.00001)
    assert result.terms['NE'].pvalue == approx(0.0036, abs=0.0001)
    assert result.terms['CT'].beta.point == approx(0.11554, abs=0.00001)
    assert result.terms['CT'].pvalue == approx(0.1150, abs=0.0001)
    assert result.terms['BW'].beta.point == approx(0.03680, abs=0.00001)
    assert result.terms['BW'].pvalue == approx(0.7326, abs=0.0001)
    assert result.terms['N'].beta.point == approx(-0.01203, abs=0.00001)
    assert result.terms['N'].pvalue == approx(0.1394, abs=0.0001)
    assert result.terms['PT'].beta.point == approx(-0.22197, abs=0.00001)
    assert result.terms['PT'].pvalue == approx(0.1035, abs=0.0001)
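

# Illustrative note (editorial addition, not part of the test suite proper):
# because the response above is log-transformed (LNC = ln C), each coefficient
# becomes a multiplicative effect on C after exponentiation. For example, for D:
def _sketch_log_response_interpretation():
    # exp(0.22760) ≈ 1.256, i.e. roughly 25.6% higher C per unit increase in D
    assert np.exp(0.22760) == approx(1.256, abs=0.001)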


def test_regress_logit_ol12_23():
    """Compare yli.regress for Ott & Longnecker (2016) chapter 12.23"""
    
    df = pd.DataFrame({
        'Unhealthy': [False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, True, False, True, False, False, False, False, False, True, False, False, True, False, False],
        'Fibrinogen': [2.52, 2.46, 2.29, 3.15, 2.88, 2.29, 2.99, 2.38, 2.56, 3.22, 2.35, 3.53, 2.65, 2.15, 3.32, 2.23, 2.19, 2.21, 5.06, 2.68, 2.09, 2.54, 2.18, 2.68, 3.41, 3.15, 3.35, 2.60, 2.28, 3.93, 2.60, 3.34],
        'GammaGlobulin': [38, 36, 36, 36, 30, 31, 36, 37, 31, 38, 29, 46, 46, 31, 35, 37, 33, 37, 37, 34, 44, 28, 31, 39, 37, 39, 32, 38, 36, 32, 41, 30]
    })
    
    result = yli.regress(yli.Logit, df, 'Unhealthy', 'Fibrinogen + GammaGlobulin')
    
    # Expect some numerical differences from the reference, as the intercept term is very negative
    lrtest_result = result.lrtest_null()
    assert lrtest_result.statistic == approx(7.9138, rel=0.01)
    assert lrtest_result.dof == 2
    assert lrtest_result.pvalue == approx(0.0191, rel=0.02)
    
    # The intercept itself cannot be validated, as it is very negative
    expbeta_fib = np.exp(result.terms['Fibrinogen'].beta)
    assert expbeta_fib.point == approx(6.756, rel=0.01)
    assert expbeta_fib.ci_lower == approx(1.007, rel=0.01)
    assert expbeta_fib.ci_upper == approx(45.308, rel=0.02)
    assert result.terms['Fibrinogen'].pvalue == approx(0.0491, rel=0.01)
    
    expbeta_gam = np.exp(result.terms['GammaGlobulin'].beta)
    assert expbeta_gam.point == approx(1.169, abs=0.001)
    assert expbeta_gam.ci_lower == approx(0.924, abs=0.001)
    assert expbeta_gam.ci_upper == approx(1.477, abs=0.001)
    assert result.terms['GammaGlobulin'].pvalue == approx(0.1925, rel=0.01)
    expected_summary = '''             Logistic Regression Results
======================================================
Dep. Variable: Unhealthy | No. Observations:        32
Model:             Logit | Df. Model:                2
Date:         {0:%Y-%m-%d} | Df. Residuals:           29
Time:           {0:%H:%M:%S} | Pseudo R²:              0.26
Std. Errors:  Non-Robust | LL-Model:             -11.47
                         | LL-Null:              -15.44
                         | p (LR):                0.02*
======================================================
               exp(β) (95% CI)       p
-----------------------------------------------
(Intercept)    0.00 (0.00 - 0.24)    0.03*
Fibrinogen     6.80 (1.01 - 45.79)   0.049*
GammaGlobulin  1.17 (0.92 - 1.48)    0.19
-----------------------------------------------'''.format(result.fitted_dt)
    
    assert result.summary() == expected_summary
    assert result._repr_html_() == '<table><caption>Logistic Regression Results</caption><tr><th>Dep. Variable:</th><td>Unhealthy</td><th>No. Observations:</th><td>32</td></tr><tr><th>Model:</th><td>Logit</td><th>Df. Model:</th><td>2</td></tr><tr><th>Date:</th><td>{0:%Y-%m-%d}</td><th>Df. Residuals:</th><td>29</td></tr><tr><th>Time:</th><td>{0:%H:%M:%S}</td><th>Pseudo <i>R</i><sup>2</sup>:</th><td>0.26</td></tr><tr><th>Std. Errors:</th><td>Non-Robust</td><th>LL-Model:</th><td>-11.47</td></tr><tr><th></th><td></td><th>LL-Null:</th><td>-15.44</td></tr><tr><th></th><td></td><th><i>p</i> (LR):</th><td>0.02*</td></tr></table><table><tr><th></th><th style="text-align:center">exp(<i>β</i>)</th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr><tr><th>(Intercept)</th><td>0.00</td><td style="padding-right:0">(0.00</td><td>–</td><td style="padding-left:0">0.24)</td><td style="text-align:left"><span style="visibility:hidden">=</span>0.03*</td></tr><tr><th>Fibrinogen</th><td>6.80</td><td style="padding-right:0">(1.01</td><td>–</td><td style="padding-left:0">45.79)</td><td style="text-align:left"><span style="visibility:hidden">=</span>0.049*</td></tr><tr><th>GammaGlobulin</th><td>1.17</td><td style="padding-right:0">(0.92</td><td>–</td><td style="padding-left:0">1.48)</td><td style="text-align:left"><span style="visibility:hidden">=</span>0.19</td></tr></table>'.format(result.fitted_dt)
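

# Illustrative sketch (editorial addition, not part of the test suite proper):
# the likelihood-ratio statistic asserted above follows directly from the two
# log-likelihoods shown in the expected summary (LL-Model -11.47, LL-Null -15.44).
def _sketch_lrtest_from_loglikelihoods():
    statistic = 2 * (-11.47 - (-15.44))  # 2 * (LL_model - LL_null)
    assert statistic == approx(7.9138, abs=0.05)
    # The p-value is the chi-squared survival function with dof = 2
    from scipy import stats
    assert stats.chi2.sf(statistic, 2) == approx(0.0191, abs=0.001)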


def test_regress_logit_ol10_18():
    """Compare odds ratios via yli.regress for Ott & Longnecker (2016) example 10.18"""
    
    # (Response, Stress, count) triples from the 2x2 contingency table
    data = [
        (False, False, 250),
        (True, False, 750),
        (False, True, 400),
        (True, True, 1600)
    ]
    df = pd.DataFrame({
        'Response': np.repeat([d[0] for d in data], [d[2] for d in data]),
        'Stress': np.repeat([d[1] for d in data], [d[2] for d in data])
    })
    
    result = yli.regress(yli.Logit, df, 'Stress', 'Response', bool_baselevels=True)
    
    assert isinstance(result.terms['Response'], CategoricalTerm)
    assert result.terms['Response'].ref_category == False
    
    expbeta = np.exp(result.terms['Response'].categories['True'].beta)
    assert expbeta.point == approx(1.333, abs=0.001)
    assert expbeta.ci_lower == approx(1.113, abs=0.001)
    assert expbeta.ci_upper == approx(1.596, abs=0.001)
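

# Illustrative sketch (editorial addition, not part of the test suite proper):
# with a single binary predictor, exp(beta) from the logit model equals the
# classical 2x2 odds ratio, which the counts above give in closed form.
def _sketch_odds_ratio_closed_form():
    # odds(Stress | Response) / odds(Stress | no Response) = (1600/750) / (400/250)
    odds_ratio = (1600 * 250) / (750 * 400)
    assert odds_ratio == approx(1.333, abs=0.001)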


def test_regress_penalisedlogit_kleinman():
    """Compare yli.regress with yli.PenalisedLogit for http://sas-and-r.blogspot.com/2010/11/example-815-firth-logistic-regression.html"""
    
    df = pd.DataFrame({
        'Pred': [1] * 20 + [0] * 220,
        'Outcome': [1] * 40 + [0] * 200
    })
    
    result = yli.regress(yli.PenalisedLogit, df, 'Outcome', 'Pred', exp=False)
    
    assert result.dof_model == 1
    
    assert result.terms['(Intercept)'].beta.point == approx(-2.280389)
    assert result.terms['(Intercept)'].beta.ci_lower == approx(-2.765427)
    assert result.terms['(Intercept)'].beta.ci_upper == approx(-1.851695)
    assert result.terms['(Intercept)'].pvalue < 0.0001
    assert result.terms['Pred'].beta.point == approx(5.993961)
    assert result.terms['Pred'].beta.ci_lower == approx(3.947048)
    assert result.terms['Pred'].beta.ci_upper == approx(10.852893)
    assert result.terms['Pred'].pvalue < 0.0001
    
    lrtest_result = result.lrtest_null()
    assert lrtest_result.statistic == approx(78.95473)
    assert lrtest_result.dof == 1
    assert lrtest_result.pvalue < 0.0001
    expected_summary = '''           Penalised Logistic Regression Results
============================================================
Dep. Variable:   Outcome | No. Observations:            240
Model:   Penalised Logit | Df. Model:                     1
Date:         {0:%Y-%m-%d} | Pseudo R²:                  0.37
Time:           {0:%H:%M:%S} | LL-Model:                 -66.43
Std. Errors:  Non-Robust | LL-Null:                 -105.91
                         | p (LR):                  <0.001*
============================================================
                β (95% CI)                  p
---------------------------------------------
(Intercept)  -2.28 (-2.77 - -1.85)    <0.001*
Pred          5.99 (3.95 - 10.85)     <0.001*
---------------------------------------------'''.format(result.fitted_dt)
    
    assert result.summary() == expected_summary
    assert result._repr_html_() == '<table><caption>Penalised Logistic Regression Results</caption><tr><th>Dep. Variable:</th><td>Outcome</td><th>No. Observations:</th><td>240</td></tr><tr><th>Model:</th><td>Penalised Logit</td><th>Df. Model:</th><td>1</td></tr><tr><th>Date:</th><td>{0:%Y-%m-%d}</td><th>Pseudo <i>R</i><sup>2</sup>:</th><td>0.37</td></tr><tr><th>Time:</th><td>{0:%H:%M:%S}</td><th>LL-Model:</th><td>-66.43</td></tr><tr><th>Std. Errors:</th><td>Non-Robust</td><th>LL-Null:</th><td>-105.91</td></tr><tr><th></th><td></td><th><i>p</i> (LR):</th><td>&lt;0.001*</td></tr></table><table><tr><th></th><th style="text-align:center"><i>β</i></th><th colspan="3" style="text-align:center">(95% CI)</th><th style="text-align:center"><i>p</i></th></tr><tr><th>(Intercept)</th><td>-2.28</td><td style="padding-right:0">(-2.77</td><td>–</td><td style="padding-left:0">-1.85)</td><td style="text-align:left">&lt;0.001*</td></tr><tr><th>Pred</th><td>5.99</td><td style="padding-right:0">(3.95</td><td>–</td><td style="padding-left:0">10.85)</td><td style="text-align:left">&lt;0.001*</td></tr></table>'.format(result.fitted_dt)
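

# Illustrative note (editorial addition, not part of the test suite proper):
# in the Kleinman data above, every row with Pred == 1 also has Outcome == 1,
# i.e. quasi-complete separation. Ordinary maximum-likelihood logit estimates
# diverge in this situation, which is why the test exercises yli.PenalisedLogit
# (Firth's penalised likelihood) instead.
def _sketch_separation_check():
    pred = [1] * 20 + [0] * 220
    outcome = [1] * 40 + [0] * 200
    # Every Pred == 1 row has Outcome == 1: quasi-complete separation
    assert all(o == 1 for p, o in zip(pred, outcome) if p == 1)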