scipy-yli/yli/survival.py

378 lines
12 KiB
Python
Raw Normal View History

2023-02-25 17:15:22 +11:00
# scipy-yli: Helpful SciPy utilities and recipes
# Copyright © 2022–2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
2023-03-05 02:11:12 +11:00
import numpy as np
import pandas as pd
2023-02-25 17:15:22 +11:00
from scipy import stats
import statsmodels.api as sm
import io
import json
import subprocess
2023-02-25 17:15:22 +11:00
from .config import config
2023-02-25 17:23:20 +11:00
from .sig_tests import ChiSquaredResult
2023-04-16 21:56:09 +10:00
from .utils import Estimate, check_nan
2023-02-25 17:15:22 +11:00
def kaplanmeier(df, time, status, by=None, *, ci=True, transform_x=None, transform_y=None, nan_policy='warn', fig=None, ax=None):
    """
    Generate a Kaplan–Meier plot

    Uses the Python *matplotlib* library.

    :param df: Data to generate plot for
    :type df: DataFrame
    :param time: Column in *df* for the time to event (numeric or timedelta)
    :type time: str
    :param status: Column in *df* for the status variable (True/False or 1/0)
    :type status: str
    :param by: Column in *df* to stratify by (categorical)
    :type by: str
    :param ci: Whether to plot confidence intervals around the survival function
    :type ci: bool
    :param transform_x: Function to transform x axis by
    :type transform_x: callable
    :param transform_y: Function to transform y axis by
    :type transform_y: callable
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str
    :param fig: Figure to return alongside *ax*; if *None* and *ax* is given, the figure owning *ax* is returned
    :type fig: Figure
    :param ax: Axes to draw on; if *None*, a new figure and axes are created
    :type ax: Axes

    :rtype: (Figure, Axes)
    """

    import matplotlib.pyplot as plt

    # Check for/clean NaNs (only over the columns actually used)
    columns = [time, status] if by is None else [time, status, by]
    df = check_nan(df[columns], nan_policy)

    # Convert timedelta to numeric
    df, time_units = survtime_to_numeric(df, time)

    if ax is None:
        fig, ax = plt.subplots()
    elif fig is None:
        # Caller supplied only the axes: still honour the documented (Figure, Axes) return
        fig = ax.figure

    if by is not None:
        # Group by independent variable, one labelled curve per group
        groups = df.groupby(by)

        for group in groups.groups:
            subset = groups.get_group(group)
            handle = plot_survfunc_kaplanmeier(ax, subset[time], subset[status], ci, transform_x, transform_y)
            handle.set_label('{} = {}'.format(by, group))
    else:
        # No grouping
        plot_survfunc_kaplanmeier(ax, df[time], df[status], ci, transform_x, transform_y)

    if time_units:
        ax.set_xlabel('{} ({})'.format(time, time_units))
    else:
        ax.set_xlabel(time)

    ax.set_ylabel('Survival probability ({:.0%} CI)'.format(1-config.alpha) if ci else 'Survival probability')
    ax.set_xlim(left=0)
    ax.set_ylim(0, 1)

    if by is not None:
        ax.legend()

    return fig, ax
2023-02-25 17:15:22 +11:00
2023-03-04 21:51:27 +11:00
def plot_survfunc_kaplanmeier(ax, time, status, ci, transform_x=None, transform_y=None):
    """
    Draw a single Kaplan–Meier curve on *ax*

    The curve (and, if *ci* is truthy, its confidence band) is computed by
    *calc_survfunc_kaplanmeier* from the same arguments.

    :return: The first line object returned by *ax.plot* for the survival curve
    """

    curve_x, curve_y, band_lower, band_upper = calc_survfunc_kaplanmeier(
        time, status, ci, transform_x, transform_y
    )

    drawn_lines = ax.plot(curve_x, curve_y)

    if ci:
        # Label '_' is carried over unchanged for the shaded band
        ax.fill_between(curve_x, band_lower, band_upper, alpha=0.3, label='_')

    return drawn_lines[0]
def calc_survfunc_kaplanmeier(time, status, ci, transform_x=None, transform_y=None):
    """
    Compute the step-function coordinates of a Kaplan–Meier survival curve

    :param time: Times to event, passed to *sm.SurvfuncRight*
    :param status: Status indicators, passed to *sm.SurvfuncRight*
    :param ci: Whether to also compute confidence limits
    :param transform_x: Optional function applied to the x coordinates
    :param transform_y: Optional function applied to the y coordinates (curve and confidence limits)

    :return: (*xpoints*, *ypoints*, *ypoints0*, *ypoints1*), where the last two are the lower and upper confidence limits, or *None* if *ci* is falsy
    """

    def _as_steps(probabilities):
        # Prepend survival 100% and duplicate each value so the curve is drawn as horizontal/vertical segments
        return np.concatenate([[1], probabilities]).repeat(2)[:-1]

    # Estimate the survival function
    survfunc = sm.SurvfuncRight(time, status)

    # Prepend time 0 so drawing starts from the origin of the time axis
    step_x = np.concatenate([[0], survfunc.surv_times]).repeat(2)[1:]
    step_y = _as_steps(survfunc.surv_prob)

    if transform_x:
        step_x = transform_x(step_x)
    if transform_y:
        step_y = transform_y(step_y)

    if not ci:
        return step_x, step_y, None, None

    # Normal-approximation confidence limits at level 1 - config.alpha
    zstar = -stats.norm.ppf(config.alpha/2)
    step_lower = _as_steps(survfunc.surv_prob - zstar * survfunc.surv_prob_se)
    step_upper = _as_steps(survfunc.surv_prob + zstar * survfunc.surv_prob_se)

    if transform_y:
        step_lower = transform_y(step_lower)
        step_upper = transform_y(step_upper)

    return step_x, step_y, step_lower, step_upper
2023-02-25 17:23:20 +11:00
def turnbull(df, time_left, time_right, by=None, *, ci=True, step_loc=0.5, maxiter=None, fail_prob_tolerance=None, se_method=None, zero_tolerance=None, transform_x=None, transform_y=None, nan_policy='warn', fig=None, ax=None):
    """
    Generate a Turnbull estimator plot, which extends the Kaplan–Meier estimator to interval-censored observations

    The intervals are assumed to be half-open intervals, (*left*, *right*]. *right* == *np.inf* implies the event was right-censored.

    By default, the survival function is drawn as a step function at the midpoint of each Turnbull interval.

    Uses the hpstat *turnbull* command.

    :param df: Data to generate plot for
    :type df: DataFrame
    :param time_left: Column in *df* for the time to event, left interval endpoint (numeric or timedelta)
    :type time_left: str
    :param time_right: Column in *df* for the time to event, right interval endpoint (numeric or timedelta)
    :type time_right: str
    :param by: Column in *df* to stratify by (categorical)
    :type by: str
    :param ci: Whether to plot confidence intervals around the survival function
    :type ci: bool
    :param step_loc: Proportion along the length of each Turnbull interval to step down the survival function, e.g. 0 for left bound, 1 for right bound, 0.5 for interval midpoint
    :type step_loc: float
    :param maxiter: Maximum number of iterations to attempt
    :type maxiter: int
    :param fail_prob_tolerance: Terminate algorithm when the absolute change in failure probability in each interval is less than this tolerance
    :type fail_prob_tolerance: float
    :param se_method: Method for computing standard error of survival probabilities (see hpstat *turnbull* documentation)
    :type se_method: str
    :param zero_tolerance: Threshold for dropping failure probability when se_method is "oim-drop-zeros"
    :type zero_tolerance: float
    :param transform_x: Function to transform x axis by
    :type transform_x: callable
    :param transform_y: Function to transform y axis by
    :type transform_y: callable
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str
    :param fig: Figure to return alongside *ax*; if *None* and *ax* is given, the figure owning *ax* is returned
    :type fig: Figure
    :param ax: Axes to draw on; if *None*, a new figure and axes are created
    :type ax: Axes

    :rtype: (Figure, Axes)
    """

    import matplotlib.pyplot as plt

    # Check for/clean NaNs (only over the columns actually used)
    columns = [time_left, time_right] if by is None else [time_left, time_right, by]
    df = check_nan(df[columns], nan_policy)

    # Convert timedelta to numeric
    df, time_units = survtime_to_numeric(df, time_left, time_right)

    if ax is None:
        fig, ax = plt.subplots()
    elif fig is None:
        # Caller supplied only the axes: still honour the documented (Figure, Axes) return
        fig = ax.figure

    if by is not None:
        # Group by independent variable, one labelled curve per group
        groups = df.groupby(by)

        for group in groups.groups:
            subset = groups.get_group(group)
            handle = plot_survfunc_turnbull(ax, subset[time_left], subset[time_right], ci, step_loc, maxiter, fail_prob_tolerance, se_method, zero_tolerance, transform_x, transform_y)
            handle.set_label('{} = {}'.format(by, group))
    else:
        # No grouping
        plot_survfunc_turnbull(ax, df[time_left], df[time_right], ci, step_loc, maxiter, fail_prob_tolerance, se_method, zero_tolerance, transform_x, transform_y)

    if time_units:
        ax.set_xlabel('{} + {} ({})'.format(time_left, time_right, time_units))
    else:
        ax.set_xlabel('{} + {}'.format(time_left, time_right))

    # Same labelling convention as kaplanmeier: state the CI level when the band is drawn
    ax.set_ylabel('Survival probability ({:.0%} CI)'.format(1-config.alpha) if ci else 'Survival probability')
    ax.set_xlim(left=0)
    ax.set_ylim(0, 1)

    if by is not None:
        ax.legend()

    return fig, ax
def plot_survfunc_turnbull(ax, time_left, time_right, ci, step_loc=0.5, maxiter=None, fail_prob_tolerance=None, se_method=None, zero_tolerance=None, transform_x=None, transform_y=None):
    """
    Draw a single Turnbull survival curve on *ax*

    The curve (and, if *ci* is truthy, its confidence band) is computed by
    *calc_survfunc_turnbull* from the same arguments.

    :return: The first line object returned by *ax.plot* for the survival curve
    """

    curve_x, curve_y, band_lower, band_upper = calc_survfunc_turnbull(
        time_left, time_right, ci, step_loc,
        maxiter, fail_prob_tolerance, se_method, zero_tolerance,
        transform_x, transform_y,
    )

    drawn_lines = ax.plot(curve_x, curve_y)

    if ci:
        # Label '_' is carried over unchanged for the shaded band
        ax.fill_between(curve_x, band_lower, band_upper, alpha=0.3, label='_')

    return drawn_lines[0]
def calc_survfunc_turnbull(time_left, time_right, ci, step_loc=0.5, maxiter=None, fail_prob_tolerance=None, se_method=None, zero_tolerance=None, transform_x=None, transform_y=None):
    """
    Compute the step-function coordinates of a Turnbull survival curve

    Runs the hpstat binary at *config.hpstat_path* with the *turnbull* command, passing the intervals
    as CSV (columns *LeftTime*, *RightTime*) on standard input and reading a JSON result from standard output.

    :param time_left: Left interval endpoints
    :param time_right: Right interval endpoints
    :param ci: Whether to also compute confidence limits
    :param step_loc: Proportion along each failure interval at which the curve steps down
    :param maxiter: If not *None*, passed to hpstat as *--max-iterations*
    :param fail_prob_tolerance: If not *None*, passed to hpstat as *--fail-prob-tolerance*
    :param se_method: If non-empty, passed to hpstat as *--se-method*
    :param zero_tolerance: If not *None*, passed to hpstat as *--zero-tolerance*
    :param transform_x: Optional function applied to the x coordinates
    :param transform_y: Optional function applied to the y coordinates (curve and confidence limits)

    :return: (*xpoints*, *ypoints*, *ypoints0*, *ypoints1*), where the last two are the lower and upper confidence limits, or *None* if *ci* is falsy
    :raises subprocess.CalledProcessError: If hpstat exits with a non-zero status
    """

    # Prepare arguments
    # Numeric options are tested against None so that an explicit 0 is forwarded rather than silently dropped
    hpstat_args = [config.hpstat_path, 'turnbull', '-', '--output', 'json']
    if maxiter is not None:
        hpstat_args.extend(['--max-iterations', str(maxiter)])
    if fail_prob_tolerance is not None:
        hpstat_args.extend(['--fail-prob-tolerance', str(fail_prob_tolerance)])
    if se_method:
        hpstat_args.extend(['--se-method', se_method])
    if zero_tolerance is not None:
        hpstat_args.extend(['--zero-tolerance', str(zero_tolerance)])

    # Export data to CSV
    csv_buf = io.StringIO()
    pd.DataFrame({'LeftTime': time_left, 'RightTime': time_right}).to_csv(csv_buf, index=False)
    csv_str = csv_buf.getvalue()

    # Run hpstat binary (stderr is inherited, so anything hpstat writes there goes to the caller's stderr)
    proc = subprocess.run(hpstat_args, input=csv_str, stdout=subprocess.PIPE, stderr=None, encoding='utf-8', check=True)
    raw_result = json.loads(proc.stdout)

    survival_prob = np.array(raw_result['survival_prob'])

    # Clear notebook cell output after the run — presumably to remove hpstat progress output; TODO confirm
    # IPython is optional: outside IPython/Jupyter there is nothing to clear, so a missing package is not an error
    try:
        from IPython.display import clear_output
    except ImportError:
        pass
    else:
        clear_output(wait=True)

    # Step location within each failure interval; intervals with a falsy right endpoint are skipped
    # NOTE(review): presumably this skips unbounded final intervals with no finite right endpoint — confirm against hpstat output
    xpoints = [i[0]*(1-step_loc) + i[1]*step_loc for i in raw_result['failure_intervals'] if i[1]]
    ypoints = survival_prob

    # Draw straight lines
    # np.concatenate(...) to force starting drawing from time 0, survival 100%
    xpoints = np.concatenate([[0], xpoints]).repeat(2)[1:]
    ypoints = np.concatenate([[1], ypoints]).repeat(2)[:-1]

    if transform_x:
        xpoints = transform_x(xpoints)
    if transform_y:
        ypoints = transform_y(ypoints)

    if not ci:
        return xpoints, ypoints, None, None

    # Normal-approximation confidence limits at level 1 - config.alpha
    zstar = -stats.norm.ppf(config.alpha/2)
    survival_prob_se = np.array(raw_result['survival_prob_se'])

    ci0 = survival_prob - zstar * survival_prob_se
    ci1 = survival_prob + zstar * survival_prob_se

    # Same step layout as the survival curve
    ypoints0 = np.concatenate([[1], ci0]).repeat(2)[:-1]
    ypoints1 = np.concatenate([[1], ci1]).repeat(2)[:-1]

    if transform_y:
        ypoints0 = transform_y(ypoints0)
        ypoints1 = transform_y(ypoints1)

    return xpoints, ypoints, ypoints0, ypoints1
2023-03-04 21:51:27 +11:00
def survtime_to_numeric(df, time, time2=None):
    """
    Convert pandas timedelta dtype to float64, auto-detecting the best time unit to display

    :param df: Data to check for pandas timedelta dtype
    :type df: DataFrame
    :param time: Column to check for pandas timedelta dtype
    :type time: str
    :param time2: Second column, if any, to check for pandas timedelta dtype
    :type time2: str

    :return: (*df*, *time_units*)

        * **df** (*DataFrame*) – Data with pandas timedelta dtypes converted, which is *not* copied
        * **time_units** (*str*) – Human-readable description of the time unit, or *None* if not converted
    """

    # Largest time (in seconds) over the converted columns; None means nothing was converted
    max_time = None

    # is_timedelta64_dtype accepts any timedelta resolution, not only nanoseconds
    if pd.api.types.is_timedelta64_dtype(df[time]):
        df[time] = df[time].dt.total_seconds()
        max_time = df[time].max()

    # Only look up the second column when one was actually given (df[None] would raise KeyError)
    if time2 is not None and pd.api.types.is_timedelta64_dtype(df[time2]):
        df[time2] = df[time2].dt.total_seconds()
        max_time = max(max_time or 0, df[time2].max())

    if max_time is None:
        return df, None

    # Auto-detect best time units: the largest unit which the maximum time exceeds
    # The weeks threshold was previously 1/12 of a week, which made the days branch unreachable
    unit_table = (
        (365.24*24*60*60, 'years'),
        (7*24*60*60, 'weeks'),
        (24*60*60, 'days'),
        (60*60, 'hours'),
        (60, 'minutes'),
    )

    time_divider, time_units = 1, 'seconds'
    for unit_seconds, unit_name in unit_table:
        if max_time > unit_seconds:
            time_divider, time_units = unit_seconds, unit_name
            break

    df[time] /= time_divider
    if time2 is not None:
        df[time2] /= time_divider

    return df, time_units
2023-02-25 17:23:20 +11:00
def logrank(df, time, status, by, nan_policy='warn'):
    """
    Perform the log-rank test for equality of survival functions

    :param df: Data to perform the test on
    :type df: DataFrame
    :param time: Column in *df* for the time to event (numeric or timedelta)
    :type time: str
    :param status: Column in *df* for the status variable (True/False or 1/0)
    :type status: str
    :param by: Column in *df* to stratify by (categorical)
    :type by: str
    :param nan_policy: How to handle *nan* values (see :ref:`nan-handling`)
    :type nan_policy: str

    :rtype: :class:`yli.sig_tests.ChiSquaredResult`
    """

    # TODO: Example

    # Check for/clean NaNs
    df = check_nan(df[[time, status, by]], nan_policy)

    # Convert timedelta (of any resolution) to numeric seconds
    if pd.api.types.is_timedelta64_dtype(df[time]):
        df[time] = df[time].dt.total_seconds()

    statistic, pvalue = sm.duration.survdiff(df[time], df[status], df[by])

    # The test has (number of groups - 1) degrees of freedom; this equals 1 only when comparing two groups
    dof = df[by].nunique() - 1

    return ChiSquaredResult(statistic=statistic, dof=dof, pvalue=pvalue)