2023-02-25 17:15:22 +11:00
|
|
|
# scipy-yli: Helpful SciPy utilities and recipes
|
|
|
|
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Affero General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Affero General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
2023-03-05 02:11:12 +11:00
|
|
|
import numpy as np
|
2023-02-25 17:15:22 +11:00
|
|
|
from scipy import stats
|
|
|
|
import statsmodels.api as sm
|
|
|
|
|
|
|
|
from .config import config
|
2023-02-25 17:23:20 +11:00
|
|
|
from .sig_tests import ChiSquaredResult
|
2023-03-05 02:11:12 +11:00
|
|
|
from .regress import RegressionResult, SingleTerm
|
|
|
|
from .utils import Estimate, check_nan, cols_for_formula, convert_pandas_nullable
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
import weakref
|
2023-02-25 17:15:22 +11:00
|
|
|
|
2023-03-04 21:51:27 +11:00
|
|
|
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn'):
    """
    Generate a Kaplan–Meier plot

    Uses the Python *matplotlib* library.

    :param df: Data to generate plot for
    :type df: DataFrame
    :param time: Column in *df* for the time to event (numeric or timedelta)
    :type time: str
    :param status: Column in *df* for the status variable (True/False or 1/0)
    :type status: str
    :param by: Column in *df* to stratify by (categorical), or *None* to plot a single curve
    :type by: str
    :param ci: Whether to plot confidence intervals around the survival function
    :type ci: bool
    :param transform_x: Function to transform x axis by
    :type transform_x: callable
    :param transform_y: Function to transform y axis by
    :type transform_y: callable
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: (Figure, Axes)
    """

    import matplotlib.pyplot as plt

    # Decide once whether the plot is stratified, so that the column selection
    # and the grouping below can never disagree (e.g. for a falsy column name)
    stratified = by is not None

    # Check for/clean NaNs
    columns = [time, status, by] if stratified else [time, status]
    df = check_nan(df[columns], nan_policy)

    # Convert timedelta to numeric
    df, time_units = survtime_to_numeric(df, time)

    fig, ax = plt.subplots()

    if stratified:
        # Group by independent variable
        groups = df.groupby(by)

        for group in groups.groups:
            subset = groups.get_group(group)
            handle = plot_survfunc_kaplanmeier(ax, subset[time], subset[status], ci, transform_x, transform_y)
            handle.set_label('{} = {}'.format(by, group))
    else:
        # No grouping
        plot_survfunc_kaplanmeier(ax, df[time], df[status], ci, transform_x, transform_y)

    # Mention the display unit only when the time column was converted from timedelta
    if time_units:
        ax.set_xlabel('{} ({})'.format(time, time_units))
    else:
        ax.set_xlabel(time)
    ax.set_ylabel('Survival probability ({:.0%} CI)'.format(1-config.alpha) if ci else 'Survival probability')
    ax.set_xlim(left=0)
    ax.set_ylim(0, 1)

    # Only stratified curves are given labels; requesting a legend with no
    # labelled artists makes matplotlib warn, so skip it for a single curve
    if stratified:
        ax.legend()

    return fig, ax
|
2023-02-25 17:15:22 +11:00
|
|
|
|
2023-03-04 21:51:27 +11:00
|
|
|
def plot_survfunc_kaplanmeier(ax, time, status, ci, transform_x=None, transform_y=None):
    """
    Estimate a Kaplan–Meier survival function and draw it as a step function on existing axes

    :param ax: Axes to draw on
    :type ax: Axes
    :param time: Times to event (numeric)
    :type time: Series
    :param status: Status values (True/False or 1/0)
    :type status: Series
    :param ci: Whether to shade a confidence band around the survival function
    :type ci: bool
    :param transform_x: Function to transform the x coordinates by, applied to both the curve and the band
    :type transform_x: callable
    :param transform_y: Function to transform the y coordinates by, applied to both the curve and the band
    :type transform_y: callable

    :return: Handle of the line drawn for the survival function, so the caller can label it
    """

    # Estimate the survival function
    sf = sm.SurvfuncRight(time, status)

    # Draw straight lines
    # Repeating each point and offsetting x against y by one yields the corners of the steps
    xpoints = sf.surv_times.repeat(2)[1:]
    ypoints = sf.surv_prob.repeat(2)[:-1]

    # Transform before plotting, so that the curve and the confidence band
    # are drawn in the same coordinates
    if transform_x:
        xpoints = transform_x(xpoints)
    if transform_y:
        ypoints = transform_y(ypoints)

    handle = ax.plot(xpoints, ypoints)[0]

    if ci:
        # Two-sided critical value of the standard normal distribution
        zstar = -stats.norm.ppf(config.alpha/2)

        # Get confidence intervals (estimate +/- zstar standard errors)
        ci0 = sf.surv_prob - zstar * sf.surv_prob_se
        ci1 = sf.surv_prob + zstar * sf.surv_prob_se

        # Plot confidence intervals, using the same step layout as the curve
        ypoints0 = ci0.repeat(2)[:-1]
        ypoints1 = ci1.repeat(2)[:-1]

        if transform_y:
            ypoints0 = transform_y(ypoints0)
            ypoints1 = transform_y(ypoints1)

        # The underscore label keeps the band out of automatically generated legends
        ax.fill_between(xpoints, ypoints0, ypoints1, alpha=0.3, label='_')

    return handle
|
2023-02-25 17:23:20 +11:00
|
|
|
|
2023-03-04 21:51:27 +11:00
|
|
|
def turnbull(df, time_left, time_right, by=None, *, transform_x=None, transform_y=None, nan_policy='warn'):
    """
    Generate a Turnbull estimator plot, which extends the Kaplan–Meier estimator to interval-censored observations

    The intervals are assumed to be half-open intervals, (*left*, *right*]. *right* == *np.inf* implies the event was right-censored. Unlike :func:`yli.kaplanmeier`, times must be given as numeric dtypes and not as pandas timedelta.

    For ease of interpretation, the survival function is drawn as a step function at the midpoint of the estimate on each interval.

    Uses the Python *lifelines* and *matplotlib* libraries.

    :param df: Data to generate plot for
    :type df: DataFrame
    :param time_left: Column in *df* for the time to event, left interval endpoint (numeric)
    :type time_left: str
    :param time_right: Column in *df* for the time to event, right interval endpoint (numeric)
    :type time_right: str
    :param by: Column in *df* to stratify by (categorical), or *None* to plot a single curve
    :type by: str
    :param transform_x: Function to transform x axis by
    :type transform_x: callable
    :param transform_y: Function to transform y axis by
    :type transform_y: callable
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: (Figure, Axes)
    """

    import matplotlib.pyplot as plt

    # Decide once whether the plot is stratified, so that the column selection
    # and the grouping below can never disagree (e.g. for a falsy column name)
    stratified = by is not None

    # Check for/clean NaNs
    columns = [time_left, time_right, by] if stratified else [time_left, time_right]
    df = check_nan(df[columns], nan_policy)

    fig, ax = plt.subplots()

    if stratified:
        # Group by independent variable
        groups = df.groupby(by)

        for group in groups.groups:
            subset = groups.get_group(group)
            handle = plot_survfunc_turnbull(ax, subset[time_left], subset[time_right], transform_x, transform_y)
            handle.set_label('{} = {}'.format(by, group))
    else:
        # No grouping
        plot_survfunc_turnbull(ax, df[time_left], df[time_right], transform_x, transform_y)

    ax.set_xlabel('Analysis time')
    ax.set_ylabel('Survival probability')
    ax.set_xlim(left=0)
    ax.set_ylim(0, 1)

    # Only stratified curves are given labels; requesting a legend with no
    # labelled artists makes matplotlib warn, so skip it for a single curve
    if stratified:
        ax.legend()

    return fig, ax
|
|
|
|
|
|
|
|
def plot_survfunc_turnbull(ax, time_left, time_right, transform_x=None, transform_y=None):
    """
    Fit an interval-censored survival function with *lifelines* and draw it as a step function on existing axes

    The curve is drawn at the midpoint of the upper and lower NPMLE estimates.

    :param ax: Axes to draw on
    :type ax: Axes
    :param time_left: Left interval endpoints (numeric)
    :type time_left: Series
    :param time_right: Right interval endpoints (numeric)
    :type time_right: Series
    :param transform_x: Function to transform the x coordinates by
    :type transform_x: callable
    :param transform_y: Function to transform the y coordinates by
    :type transform_y: callable

    :return: Handle of the line drawn for the survival function, so the caller can label it
    """

    import lifelines

    # Small offset added to the left endpoints to make the intervals half-open
    EPSILON = 1e-10

    # TODO: Support left == right => failure was exactly observed

    lower_bounds = time_left + EPSILON
    upper_bounds = time_right

    # Estimate the survival function
    fitter = lifelines.KaplanMeierFitter()
    fitted = fitter.fit_interval_censoring(lower_bounds, upper_bounds)
    estimate = fitted.survival_function_

    # Corners of the step function: every point is repeated, with x and y offset by one
    step_x = estimate.index.to_numpy().repeat(2)[:-1]
    midpoint = (estimate['NPMLE_estimate_upper'] + estimate['NPMLE_estimate_lower']) / 2
    step_y = midpoint.to_numpy().repeat(2)[1:]

    if transform_x:
        step_x = transform_x(step_x)
    if transform_y:
        step_y = transform_y(step_y)

    return ax.plot(step_x, step_y)[0]
|
|
|
|
|
|
|
|
def survtime_to_numeric(df, time):
    """
    Convert pandas timedelta dtype to float64, auto-detecting the best time unit to display

    The largest unit which the maximum time exceeds is chosen, i.e. years if the maximum is more than 1 year, otherwise weeks if more than 1 week, and so on down to seconds.

    :param df: Data to check for pandas timedelta dtype
    :type df: DataFrame
    :param time: Column to check for pandas timedelta dtype
    :type time: str

    :return: (*df*, *time_units*)

        * **df** (*DataFrame*) – Data with pandas timedelta dtypes converted, which is *not* copied (the column is replaced in place)
        * **time_units** (*str*) – Human-readable description of the time unit, or *None* if not converted
    """

    if df[time].dtype != '<m8[ns]':
        # Not a timedelta column, so there is nothing to convert
        return df, None

    df[time] = df[time].dt.total_seconds()

    # Candidate display units from largest to smallest, with their length in seconds
    # The weeks threshold is one whole week, so that the days branch is reachable
    # and a maximum of under a week is never shown as a fraction of a week
    unit_lengths = (
        ('years', 365.24*24*60*60),
        ('weeks', 7*24*60*60),
        ('days', 24*60*60),
        ('hours', 60*60),
        ('minutes', 60),
    )

    # Auto-detect best time units
    longest = df[time].max()
    for time_units, seconds_per_unit in unit_lengths:
        if longest > seconds_per_unit:
            df[time] = df[time] / seconds_per_unit
            return df, time_units

    # Already in seconds
    return df, 'seconds'
|
|
|
|
|
2023-02-25 17:23:20 +11:00
|
|
|
def logrank(df, time, status, by, nan_policy='warn'):
    """
    Perform the log-rank test for equality of survival functions

    :param df: Data to perform the test on
    :type df: DataFrame
    :param time: Column in *df* for the time to event (numeric or timedelta)
    :type time: str
    :param status: Column in *df* for the status variable (True/False or 1/0)
    :type status: str
    :param by: Column in *df* to stratify by (categorical)
    :type by: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.ChiSquaredResult`
    """

    # TODO: Example

    # Check for/clean NaNs
    df = check_nan(df[[time, status, by]], nan_policy)

    # Convert timedelta to numeric (seconds)
    if df[time].dtype == '<m8[ns]':
        df[time] = df[time].dt.total_seconds()

    statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])

    # The log-rank statistic comparing k groups is referred to a chi-squared
    # distribution with k - 1 degrees of freedom (1 in the two-group case)
    dof = df[by].nunique() - 1

    return ChiSquaredResult(statistic=statistic, dof=dof, pvalue=pvalue)
|
2023-03-05 02:11:12 +11:00
|
|
|
|
|
|
|
# --------------------------------
|
|
|
|
# Interval-censored Cox regression
|
|
|
|
|
|
|
|
def cox_interval_censored(
    df, time_left, time_right, formula, *,
    bootstrap_samples=100,
    nan_policy='warn',
    bool_baselevels=False, exp=True,
):
    """
    Fit a Cox regression model to interval-censored observations

    Uses the R *icenReg* library (function *ic_sp*) via the Python *rpy2* library.

    :param df: Data to perform regression on
    :type df: DataFrame
    :param time_left: Column in *df* for the time to event, left interval endpoint (must be float64)
    :type time_left: str
    :param time_right: Column in *df* for the time to event, right interval endpoint (must be float64)
    :type time_right: str
    :param formula: Right-hand side of the model formula, giving the independent variables
    :type formula: str
    :param bootstrap_samples: Number of bootstrap samples, passed to *ic_sp* as *bs_samples*
    :type bootstrap_samples: int
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str
    :param bool_baselevels: Accepted, but not used within this function
    :type bool_baselevels: bool
    :param exp: Passed through as the final argument of :class:`RegressionResult` (presumably whether to report exponentiated estimates – confirm against that class)
    :type exp: bool

    :raises NotImplementedError: If either time column is not of float64 dtype

    :rtype: :class:`RegressionResult`
    """

    # Hold only a weak reference to the caller's DataFrame for the result
    original_df = weakref.ref(df)

    # Check for/clean NaNs in input columns
    needed_columns = [time_left, time_right] + cols_for_formula(formula, df)
    df = check_nan(df[needed_columns], nan_policy)

    # FIXME: Ensure numeric type for dependent variable
    if not (df[time_left].dtype == 'float64' and df[time_right].dtype == 'float64'):
        raise NotImplementedError('Time dtypes must be float64')

    # Convert pandas nullable types for independent variables
    df = convert_pandas_nullable(df)

    # ---------
    # Fit model

    # lifelines.CoxPHFitter doesn't do confidence intervals so we use R

    import rpy2.robjects as ro
    import rpy2.robjects.packages
    import rpy2.robjects.pandas2ri

    # Convert bool to int otherwise rpy2 chokes
    df = df.replace({False: 0, True: 1})

    # Import icenReg
    ro.packages.importr('icenReg')

    r_formula = 'Surv({}, {}, type="interval2") ~ {}'.format(time_left, time_right, formula)

    with ro.conversion.localconverter(ro.default_converter + ro.pandas2ri.converter), ro.local_context() as lc:
        # Transfer the DataFrame and other parameters to R
        lc['df'] = df
        lc['formula_'] = r_formula
        lc['bootstrap_samples'] = bootstrap_samples

        # FIXME: Seed bootstrap RNG?

        # Fit the model
        ro.r('model <- ic_sp(as.formula(formula_), data=df, bs_samples=bootstrap_samples)')

        # Hard to access attributes through rpy2, so evaluate them in R instead
        model = ro.r('model')
        term_parameters = ro.r('model$coef')
        term_names = ro.r('names(model$coef)')
        term_cis = ro.r('confint(model)')
        cov_matrix = ro.r('model$var')
        llf = ro.r('model$llk')[0]

    # TODO: Handle categorical terms?
    terms = {}
    for idx in range(len(term_parameters)):
        name = term_names[idx]
        beta = term_parameters[idx]

        # These values not directly exposed so we must calculate them:
        # standard error from the covariance matrix diagonal, and a two-sided
        # p value from a normal distribution with that standard error
        std_err = np.sqrt(cov_matrix[idx, idx])
        pvalue = 2 * stats.norm.cdf(-np.abs(beta), loc=0, scale=std_err)

        interval = Estimate(beta, term_cis[idx][0], term_cis[idx][1])
        terms[name] = SingleTerm(name, interval, pvalue)

    return RegressionResult(
        None, original_df, '({}, {}]'.format(time_left, time_right), formula, nan_policy, None, None,
        model,
        'Interval-Censored Cox Regression', 'CoxIC', 'MLE',
        len(df), None, None, datetime.now(), 'Bootstrap',
        terms,
        llf, None,
        None, None, None,
        [],
        exp
    )
|