Update intcox documentation

RunasSudo 2023-04-29 18:33:58 +10:00
parent a1bb1568ad
commit 5876f724ad
Signed by: RunasSudo
GPG Key ID: 7234E476BF21C61A

\usepackage[hidelinks]{hyperref}
\usepackage{mathtools}
\newcommand{\bbeta}{\kern -0.1em\symbf{β}}
\newcommand{\blambda}{\kern -0.1em\symbf{Λ}}
\newcommand{\nablasub}[1]{\nabla_{\kern -0.15em #1}}
\begin{document}
{\centering\bfseries Supplemental documentation for hpstat \textit{intcox} command\par}
The hpstat \textit{intcox} command implements Cox proportional hazards regression for interval-censored observations, using an iterative convex minorant algorithm. The general algorithm was proposed by Huang \& Wellner [1]. When computing the baseline hazard, we apply a damped iterative convex minorant algorithm described by Aragón \& Eberly [2] and Pan [3]. This documentation discusses technical details of the implementation.
Let the baseline cumulative hazard be a step function $Λ(t)$ with steps at times $\{t_1, t_2, …, t_m\}$. We seek to optimise over the vector $\blambda = (Λ(t_1), Λ(t_2), …, Λ(t_m))^\mathrm{T}$. The cumulative hazard $Λ(t; \symbf{Z}_i)$ given covariates $\symbf{Z}_i$ is $\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(t)$. The cumulative distribution function for failure times is $F(t; \symbf{Z}_i) = 1 - \exp\left(-\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(t)\right)$.
Take the $i$-th observation, whose failure time falls in $(L_i, R_i]$. Its log-likelihood contribution $\mathcal{L}_i$ is:
%
\begin{align*}
\mathcal{L}_i &= \log( F(R_i; \symbf{Z}_i) - F(L_i; \symbf{Z}_i) ) \\
&= \log\left( \exp\left(-\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i)\right) - \exp\left(-\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(R_i)\right) \right) \\
\intertext{Let $S_{Li} = \exp\left(-\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i)\right)$, and $S_{Ri} = \exp\left(-\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(R_i)\right)$, so:}
\mathcal{L}_i &= \log\left( S_{Li} - S_{Ri} \right)
\end{align*}
%
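For concreteness, the following Python/NumPy sketch (illustrative only, not the actual hpstat implementation) evaluates the step-function baseline cumulative hazard and the log-likelihood contribution of a single observation. The names \texttt{Lambda}, \texttt{times}, \texttt{beta} and \texttt{Z\_i} stand for $\blambda$, $\{t_1, …, t_m\}$, $\bbeta$ and $\symbf{Z}_i$, and are assumptions of the sketch.
\begin{verbatim}
import numpy as np

def cumulative_hazard(Lambda, times, t):
    # Step-function baseline cumulative hazard: 0 before the first
    # step time, and Lambda[j] from times[j] onwards
    j = np.searchsorted(times, t, side='right')
    return Lambda[j - 1] if j > 0 else 0.0

def log_likelihood_i(Lambda, times, beta, Z_i, L_i, R_i):
    # Log-likelihood contribution of an observation censored to (L_i, R_i]
    e_i = np.exp(Z_i @ beta)  # exp(Z_i^T beta)
    S_L = np.exp(-e_i * cumulative_hazard(Lambda, times, L_i))
    S_R = np.exp(-e_i * cumulative_hazard(Lambda, times, R_i))
    return np.log(S_L - S_R)
\end{verbatim}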
Note the gradient $\nablasub{\blambda} S_{Li}$ with respect to $\blambda$ is the vector whose $j$-th element is:
%
$$\frac{\partial S_{Li}}{\partial Λ(t_j)} = - S_{Li} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) \mathrm{I}(t_j = L_i)$$
%
$\nablasub{\blambda} S_{Ri}$ is derived similarly, and so the gradient $\nablasub{\blambda} \mathcal{L}_i$ is the vector whose $j$-th element is:
%
$$\frac{\partial}{\partial Λ(t_j)} \log\left( S_{Li} - S_{Ri} \right) = \frac{A_{ij}}{S_{Li} - S_{Ri}}$$
%
Where $A_{ij} = \partial\left( S_{Li} - S_{Ri} \right) / \partial Λ(t_j) = S_{Ri} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) \mathrm{I}(t_j = R_i) - S_{Li} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) \mathrm{I}(t_j = L_i)$. The sum of all $\nablasub{\blambda} \mathcal{L}_i$ yields $\nablasub{\blambda} \mathcal{L}$.
Note that $\partial A_{ij} / \partial Λ(t_j) = -A_{ij} \exp(\symbf{Z}_i^\mathrm{T} \bbeta)$, so applying the quotient rule and simplifying, the Hessian $\nabla^2_{\blambda} \mathcal{L}_i$ has diagonal $(j, j)$-th elements:
%
$$\frac{\partial^2}{\partial Λ(t_j){}^2} \log\left( S_{Li} - S_{Ri} \right) = \frac{-A_{ij} \exp(\symbf{Z}_i^\mathrm{T} \bbeta)}{S_{Li} - S_{Ri}} - \left( \frac{A_{ij}}{S_{Li} - S_{Ri}} \right)^2$$
%
And the sum of all $\nabla^2_{\blambda} \mathcal{L}_i$ yields $\nabla^2_{\blambda} \mathcal{L}$.
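Since $A_{ij}$ is nonzero only where $t_j = L_i$ or $t_j = R_i$, each observation contributes to at most two elements of the gradient and of the Hessian diagonal. A minimal sketch, building on the functions above and assuming every finite $L_i$ and $R_i$ appears among the step times:
\begin{verbatim}
def grad_hess_Lambda(Lambda, times, beta, Z, L, R):
    # Gradient of the log-likelihood, and diagonal of its Hessian,
    # with respect to Lambda, accumulated over all observations
    grad = np.zeros(len(times))
    hess_diag = np.zeros(len(times))
    t_index = {t: j for j, t in enumerate(times)}
    for Z_i, L_i, R_i in zip(Z, L, R):
        e_i = np.exp(Z_i @ beta)
        S_L = np.exp(-e_i * cumulative_hazard(Lambda, times, L_i))
        S_R = np.exp(-e_i * cumulative_hazard(Lambda, times, R_i))
        denom = S_L - S_R
        # A_ij is nonzero only at t_j = R_i (A_ij = S_R e_i)
        # and at t_j = L_i (A_ij = -S_L e_i)
        for t, A_ij in ((R_i, S_R * e_i), (L_i, -S_L * e_i)):
            j = t_index.get(t)
            if j is not None:  # e.g. L_i = 0 with no step at 0
                grad[j] += A_ij / denom
                hess_diag[j] += -A_ij * e_i / denom - (A_ij / denom) ** 2
    return grad, hess_diag
\end{verbatim}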
Let $\symbf{G}$ be a diagonal matrix of the diagonal elements of $-\nabla^2_{\blambda} \mathcal{L}$. As discussed by Pan [3], we update $\blambda$ by iteratively applying:
%
$$\blambda \leftarrow \mathrm{Proj}[ \blambda + α \symbf{G}^{-1} \nablasub{\blambda} \mathcal{L}, \symbf{G}, \mathbb{R} ]$$
%
Where $\mathrm{Proj}$, as defined by Pan [3], essentially represents a weighted monotonic (isotonic) regression. We port an efficient pool adjacent violators algorithm (PAVA) implementation from scikit-learn.
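The following is a minimal sketch of a weighted PAVA and of one damped update step; it illustrates the same technique but does not reproduce the ported scikit-learn code. The damping factor \texttt{alpha} corresponds to $α$ above.
\begin{verbatim}
def pava(y, w):
    # Weighted isotonic regression by pool adjacent violators: returns
    # the nondecreasing vector x minimising sum_j w_j (x_j - y_j)^2
    blocks = []  # each block: [weighted mean, total weight, length]
    for y_j, w_j in zip(y, w):
        blocks.append([y_j, w_j, 1])
        # pool while the last two blocks violate monotonicity
        while len(blocks) > 1 and blocks[-2][0] >= blocks[-1][0]:
            m2, w2, n2 = blocks.pop()
            m1, w1, n1 = blocks.pop()
            blocks.append([(w1*m1 + w2*m2) / (w1 + w2), w1 + w2, n1 + n2])
    return np.concatenate([np.full(n, m) for m, w, n in blocks])

def icm_step(Lambda, grad, hess_diag, alpha):
    # One damped ICM update: Proj[Lambda + alpha G^{-1} grad, G, R],
    # where Proj is isotonic regression weighted by diag(G)
    G = -hess_diag  # diagonal of G
    target = Lambda + alpha * grad / G
    return np.maximum(pava(target, G), 0.0)  # nondecreasing, nonnegative
\end{verbatim}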
Turning to the estimation of $\bbeta$, note the gradient $\nablasub{\bbeta} S_{Li}$ with respect to $\bbeta$ is:
%
$$\nablasub{\bbeta} S_{Li} = \frac{\partial S_{Li}}{\partial\bbeta} = -S_{Li} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i) \symbf{Z}_i$$
%
$\nablasub{\bbeta} S_{Ri}$ is derived similarly. The gradient $\nablasub{\bbeta} \mathcal{L}_i$ is:
%
$$\nablasub{\bbeta} \mathcal{L}_i = \left[ \frac{B_{Ri} - B_{Li}}{S_{Li} - S_{Ri}} \right] \symbf{Z}_i$$
%
Where $B_{Li} = S_{Li} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i)$ and $B_{Ri} = S_{Ri} \exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(R_i)$.
Note $\partial B_{Li} / \partial \bbeta = \exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i) (S_{Li} - B_{Li}) \symbf{Z}_i$, and $\partial B_{Ri} / \partial \bbeta$ is analogous. Applying the quotient rule and simplifying, the Hessian $\nabla^2_{\bbeta} \mathcal{L}_i$ is:
%
$$\nabla^2_{\bbeta} \mathcal{L}_i = \left[ \frac{\exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(R_i) (S_{Ri} - B_{Ri}) - \exp(\symbf{Z}_i^\mathrm{T} \bbeta) Λ(L_i) (S_{Li} - B_{Li})}{S_{Li} - S_{Ri}} - \left( \frac{B_{Ri} - B_{Li}}{S_{Li} - S_{Ri}} \right)^2 \right] \symbf{Z}_i \symbf{Z}_i^\mathrm{T}$$
%
And the sum of all $\nabla^2_{\bbeta} \mathcal{L}_i$ yields $\nabla^2_{\bbeta} \mathcal{L}$.
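These expressions translate directly into code. A sketch in the same style as above, again illustrative and reusing \texttt{cumulative\_hazard} from the first sketch:
\begin{verbatim}
def grad_hess_beta(Lambda, times, beta, Z, L, R):
    # Gradient and Hessian of the log-likelihood with respect to beta,
    # accumulated over all observations
    grad = np.zeros(len(beta))
    hess = np.zeros((len(beta), len(beta)))
    for Z_i, L_i, R_i in zip(Z, L, R):
        e_i = np.exp(Z_i @ beta)
        H_L = cumulative_hazard(Lambda, times, L_i)  # Lambda(L_i)
        H_R = cumulative_hazard(Lambda, times, R_i)  # Lambda(R_i)
        S_L, S_R = np.exp(-e_i * H_L), np.exp(-e_i * H_R)
        B_L, B_R = S_L * e_i * H_L, S_R * e_i * H_R
        c1 = (B_R - B_L) / (S_L - S_R)
        c2 = ((e_i * H_R * (S_R - B_R) - e_i * H_L * (S_L - B_L))
              / (S_L - S_R) - c1 ** 2)
        grad += c1 * Z_i
        hess += c2 * np.outer(Z_i, Z_i)
    return grad, hess
\end{verbatim}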
As discussed by Huang \& Wellner [1], we now perform standard Newton–Raphson optimisation, iteratively applying:
%
$$\bbeta \leftarrow \bbeta + [-\nabla^2_{\bbeta} \mathcal{L}]^{-1} \nablasub{\bbeta} \mathcal{L}$$
%
We alternate between updating $\blambda$ and updating $\bbeta$ until convergence is reached.
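A minimal driver over the sketches above might read as follows; the initial values, fixed damping factor and convergence criterion are assumptions of the sketch, not necessarily hpstat's actual choices.
\begin{verbatim}
def fit(times, Z, L, R, alpha=0.5, tol=1e-8, max_iter=1000):
    # Alternate damped ICM updates of Lambda with Newton-Raphson
    # updates of beta until the log-likelihood stabilises
    m = len(times)
    Lambda = np.arange(1, m + 1) / m  # crude increasing initial guess
    beta = np.zeros(Z.shape[1])
    ll_prev = -np.inf
    for _ in range(max_iter):
        grad_l, hess_l = grad_hess_Lambda(Lambda, times, beta, Z, L, R)
        Lambda = icm_step(Lambda, grad_l, hess_l, alpha)
        grad_b, hess_b = grad_hess_beta(Lambda, times, beta, Z, L, R)
        beta = beta + np.linalg.solve(-hess_b, grad_b)
        ll = sum(log_likelihood_i(Lambda, times, beta, Z_i, L_i, R_i)
                 for Z_i, L_i, R_i in zip(Z, L, R))
        if abs(ll - ll_prev) < tol:
            break
        ll_prev = ll
    return beta, Lambda
\end{verbatim}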
The covariance matrix for $\bbeta$ is computed, as suggested by Zeng, Gao \& Lin [4], as the inverse of the empirical Fisher information, which is in turn estimated from the outer products of the per-observation gradients of the profile log-likelihood.
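A hedged sketch of this estimate follows, approximating each observation's profile score by central differences. Here \texttt{profile\_Lambda} is a hypothetical helper (not shown, and not part of hpstat's interface) that re-maximises the log-likelihood over $\blambda$ with $\bbeta$ held fixed, e.g. by iterating \texttt{icm\_step} to convergence.
\begin{verbatim}
def covariance_beta(times, Z, L, R, beta_hat, eps=1e-4):
    # Empirical Fisher information as the sum over observations of the
    # outer products of profile score vectors, approximated by central
    # differences; the covariance matrix is its inverse
    n, p = Z.shape
    scores = np.zeros((n, p))
    for k in range(p):
        for sign in (+1.0, -1.0):
            beta_k = beta_hat.copy()
            beta_k[k] += sign * eps
            # hypothetical helper: maximise over Lambda, beta_k fixed
            Lambda_k = profile_Lambda(times, Z, L, R, beta_k)
            ll = np.array([
                log_likelihood_i(Lambda_k, times, beta_k, Z_i, L_i, R_i)
                for Z_i, L_i, R_i in zip(Z, L, R)])
            scores[:, k] += sign * ll / (2 * eps)
    fisher = scores.T @ scores
    return np.linalg.inv(fisher)
\end{verbatim}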
{\vspace{0.5cm}\scshape\centering References\par}
\begin{enumerate}
\item Huang J, Wellner JA. Interval censored survival data: a review of recent progress. In: Lin DY, Fleming TR, editors. \textit{Proceedings of the First Seattle Symposium in Biostatistics: survival analysis}; 1995 Nov 20–21; University of Washington, Seattle. New York: Springer-Verlag; 1997. p. 123–69. \href{https://doi.org/10.1007/978-1-4684-6316-3_8}{doi: 10.1007\slash 978-1-4684-6316-3\_8}
\item Aragón J, Eberly D. On convergence of convex minorant algorithms for distribution estimation with interval-censored data. \textit{Journal of Computational and Graphical Statistics}. 1992;1(2):129–40. \href{https://doi.org/10.2307/1390837}{doi: 10.2307\slash 1390837}
\item Pan W. Extending the iterative convex minorant algorithm to the Cox model for interval-censored data. \textit{Journal of Computational and Graphical Statistics}. 1999;8(1):109–20. \href{https://doi.org/10.1080/10618600.1999.10474804}{doi: 10.1080\slash 10618600.\allowbreak{}1999.10474804}
\item Zeng D, Gao F, Lin DY. Maximum likelihood estimation for semiparametric regression models with multivariate interval-censored data. \textit{Biometrika}. 2017;104(3):505–25. \href{https://doi.org/10.1093/biomet/asx029}{doi: 10.\allowbreak{}1093\slash biomet\slash asx029}