Chapter 14: Fitting the Cox regression model • alda

library(alda)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(patchwork)
library(survival)
library(muhaz)
library(car)
library(broom)
library(modelsummary)
library(gt)

14.1 Toward a statistical model for continuous-time hazard

In Chapter 14 Singer and Willett (2003) develop and explain the Cox regression model (often labelled the proportional hazards model) using data from Henning and Frueh (1996), who measured days to rearrest in a sample of 194 inmates recently released from a medium security prison. Inmates were followed for up to three years or until they were rearrested.

For this example we use the rearrest data set, a person-level data frame with 194 rows and 7 columns:

id: Participant ID.
days: Number of days to rearrest.
months: Number of months to rearrest, on the scale of an “average” month (with 30.4375 days).
censor: Censoring status.
person_crime: Committed a person-related crime
property_crime: Binary indicator for whether the inmate committed a property crime.
age: Centred age at time of release.

rearrest
#> # A tibble: 194 × 7
#>    id     days months censor person_crime property_crime    age
#>    <fct> <dbl>  <dbl>  <dbl>        <dbl>          <dbl>  <dbl>
#>  1 1         2 0.0657      0            1              1  -1.68
#>  2 2         4 0.131       0            0              1 -10.5 
#>  3 3         7 0.230       0            1              1  -4.43
#>  4 4         9 0.296       0            0              1 -11.3 
#>  5 5         9 0.296       0            1              1  -7.16
#>  6 6        10 0.329       0            1              0  -2.87
#>  7 7        15 0.493       0            0              1  -6.43
#>  8 8        19 0.624       1            1              1  22.5 
#>  9 9        21 0.690       0            1              1 -12.5 
#> 10 10       22 0.723       0            1              1  -9.41
#> # ℹ 184 more rows

To inform specification of the Cox regression model we will fit in subsequent sections, we begin with some basic exploration and description of the rearrest data using the descriptive methods introduced in Chapter 13. Following Singer and Willett (2003), our research question centres on the effect of the person_crime predictor.

Plots of within-group sample functions

Here we will plot within-group sample functions of survival, cumulative hazard, and kernel-smoothed hazard estimates for the person_crime predictor. Notice that former inmates who committed a person-related crime are consistently at higher risk of rearrest than those who committed other types of crime.

# Estimate survival and cumulative hazard functions ---------------------------
rearrest_survfit <- survfit(Surv(months, 1 - censor) ~ 1, data = rearrest)
person_crime_0_survfit <- update(rearrest_survfit, subset = person_crime == 0)
person_crime_1_survfit <- update(rearrest_survfit, subset = person_crime == 1)

person_crime_survfits <- list(
  person_crime_0 = person_crime_0_survfit,
  person_crime_1 = person_crime_1_survfit
)

person_crime_lifetables <- map(
  person_crime_survfits,
  \(.x) {
    .x |>
      survfit0() |>
      tidy() |>
      mutate(cumhaz.estimate = -log(estimate)) |>
      select(time, surv.estimate = estimate, cumhaz.estimate)
  }
)

# Estimate kernel-smoothed hazard functions -----------------------------------
person_crime_smoothhaz <- map2(
  list(
    person_crime_0 = filter(rearrest, person_crime == 0)$months,
    person_crime_1 = filter(rearrest, person_crime == 1)$months
  ),
  list(
    1 - filter(rearrest, person_crime == 0)$censor,
    1 - filter(rearrest, person_crime == 1)$censor
  ),
  \(.time, .event) {
    kernel_smoothed_hazard <- muhaz(
      .time,
      .event,
      min.time = min(.time[1 - .event == 0]) + 8,
      max.time = max(.time[1 - .event == 0]) - 8,
      bw.grid = 8,
      bw.method = "global",
      b.cor = "none",
      kern = "epanechnikov"
    )
    
    kernel_smoothed_hazard |>
      tidy() |>
      rename(smoothhaz.estimate = estimate)
  }
)

# Join estimates --------------------------------------------------------------
person_crime_functions <- person_crime_lifetables |>
  map2(
    person_crime_smoothhaz,
    \(.x, .y) {
      .x |>
        full_join(.y) |>
        arrange(time) |>
        rename(months = time)
    }
  ) |>
  list_rbind(names_to = "person_crime") |>
  mutate(person_crime = person_crime == "person_crime_1")

# Plot ------------------------------------------------------------------------
person_crime_surv <- person_crime_functions |>
  select(months, surv.estimate, person_crime) |>
  na.omit() |>
  ggplot(aes(x = months, y = surv.estimate, colour = person_crime)) +
    geom_step() +
    scale_colour_brewer(palette = "Dark2") +
    coord_cartesian(xlim = c(0, 36), ylim = c(0, 1))

person_crime_cumhaz <- person_crime_functions |>
  select(months, cumhaz.estimate, person_crime) |>
  na.omit() |>
  ggplot(aes(x = months, y = cumhaz.estimate, colour = person_crime)) +
    geom_step() +
    scale_colour_brewer(palette = "Dark2") +
    coord_cartesian(xlim = c(0, 36), ylim = c(0, 1.5))

person_crime_smoothhaz <- person_crime_functions |>
  select(months, smoothhaz.estimate, person_crime) |>
  na.omit() |>
  ggplot(aes(x = months, y = smoothhaz.estimate, colour = person_crime)) +
    geom_line() +
    scale_colour_brewer(palette = "Dark2") +
    coord_cartesian(xlim = c(0, 36), ylim = c(0, .08))

# Figure 14.1, page 505:
person_crime_surv / person_crime_cumhaz / person_crime_smoothhaz +
  plot_layout(guides = "collect", axes = "collect")

What kind of statistical model do these plots suggest?

In specifying a suitable statistical model to represent the relationship between the population continuous-time hazard function and predictors, Singer and Willett (2003) begin by developing a cumulative hazard formulation of the Cox regression model, which has two complications we must deal with.

First, the model must respect the semi-bounded nature of the cumulative hazard, precluding impossible values below the 0. This complication can be dealt with using a logarithmic transformation on the cumulative hazard function, yielding the unbounded the log cumulative hazard function.

person_crime_lifetable <- person_crime_lifetables |>
  list_rbind(names_to = "person_crime") |>
  mutate(
    logcumhaz.estimate = log(cumhaz.estimate),
    person_crime = person_crime == "person_crime_1"
  ) |>
  rename(months = time)

# Figure 14.2, page 508 (top):
person_crime_lifetable |>
  filter(!is.infinite(logcumhaz.estimate)) |>
  ggplot(aes(x = months, y = logcumhaz.estimate, colour = person_crime)) +
    geom_hline(yintercept = 0, alpha = .25) +
    geom_step() +
    scale_colour_brewer(palette = "Dark2") +
    coord_cartesian(xlim = c(0, 36), ylim = c(-6, 1))

Second, the model must describe the shape of the entire log cumulative hazard function over time. Similar to discrete time, this complication can be dealt with by expressing the entire log cumulative hazard function as the sum of a baseline function and a weighted linear combination of predictors; however, unlike the discrete-time hazard model, we do not specify its shape, only that it has some shape and is a continuous function.

Following the same logic when we developed discrete-time hazard models in Chapter 11, we can express a Cox regression model for the rearrest data as:

$\log H(t_{ij}) = \log H_0(t_{j}) + \beta_1 \text{person_crime}_i,$

where $\log H_0(t_{j})$ represents the completely general baseline log cumulative hazard function.

This model can also be expressed in terms of (raw) cumulative hazard by antilogging both sides of the equation:

$H(t_{ij}) = H_0(t_{j}) e^{(\beta_1 \text{person_crime}_i)},$

Note that that this transformation converts the vertical distances between the (log) cumulative hazard functions from a constant absolute amount, $\beta_1$ , to a constant relative amount, $e^{(\beta_1)}$ .

A hazard function representation for the Cox model

Thanks to mathematical identities, the Cox model takes on an identical form when expressed in terms of hazard directly, substituting the outcome (and baseline) hazard function, $h(t_{j})$ , for the outcome (and baseline) cumulative hazard function, $H(t_{j})$ :

$\log h(t_{ij}) = \log h_0(t_{j}) + \beta_1 \text{person_crime}_i.$

Antilogging both sides, we can also write this model as:

$h(t_{ij}) = h_0(t_{j}) e^{(\beta_1 \text{person_crime}_i)}.$ These representations allow us to articulate the assumptions of the Cox model using the intuitively appealing metric of hazard, instead of cumulative hazard. Singer and Willett (2003) identify three assumptions highlighted in these representations that are inherent to basic Cox regression models:

There is a postulated log hazard function for each value of the predictor (or for each combination of predictor levels for models with more than one predictor).
The postulated log hazard functions have an identical shape for all predictor values. We do not place any constraints on the specification of that shape, and the function can take on any form necessary to adequately describe the distribution of event occurrence in the population.
The postulated log hazard functions have an identical distance between them at every possible instant.

14.2 Fitting the Cox regression model to data

In Section 14.2 Singer and Willett (2003) discuss how the Cox regression model is fit to data using partial maximum likelihood estimation, which has three practical consequences for data analysis:

The shape of the baseline hazard function is irrelevant: Partial maximum likelihood estimation invokes a conditioning argument that eliminates the baseline hazard function entirely from consideration. The consequence of this is that fitted models do not provide provide predicted values of hazard, and nonparametric strategies must be used to “recover” survivor and cumulative hazard functions.
The precise event times are irrelevant: Only the rank order of event times matter. The consequence of this is that neither the exact timing of events nor their metric for clocking time influence model results, so long as the relative ranking of event times is preserved.
Ties can create analytic difficulties: Each tied observation’s contribution to the partial likelihood must be evaluated, with the required number of additional calculations increasing as a function of the the number of people tied at any specific event time. The consequence of this is that ties can be computationally expensive, and a method for dealing with them must be selected before model fitting. Singer and Willett (2003) suggest using the exact method whenever computationally feasible, and Efron’s approximation whenever not. For data with many ties, discrete-time methods are likely more appropriate.

For mathematical details of partial maximum likelihood estimation, consult the textbook.

14.3 Interpreting the results of fitting the Cox regression model to data

In Section 14.3 Singer and Willett (2003) discuss how to interpret parameter estimates, evaluate goodness-of-fit, test statistical hypotheses, and summarize the effects of several predictors simultaneously using risk scores for the Cox regression model:

Interpreting parameter estimates: Each regression coefficient describes the effect of a one-unit difference in the associated predictor on either log hazard (raw coefficients) or raw hazard (antilogged coefficients). Antilogged coefficients are hazard ratios that allow us to make comparative statements about hazard; because the Cox regression model does not estimate the baseline hazard function, we cannot make absolute statements. Similar to odds ratios, hazard ratios can be expressed as percentages, and the reference group for a hazard ratio can be swapped by taking its reciprocal.
Evaluating goodness-of-fit: As usual, likelihood ratio tests can be used to compare the −2LL statistics between null or nested models, and AIC/BIC statistics between non-nested models (in addition to using a combination of logic, theory, and prior research).
Testing statistical hypotheses: Statistical hypotheses about the effects of predictors can be tested by comparing −2LL statistics for nested models (recommended) or Wald tests (not recommended). Asymptotic confidence intervals can be constructed using asymptotic standard errors associated with Wald tests.
Using risk scores: The simultaneous effects of several predictors can be summarized using risk scores, which are model-based predictions that compare each individual’s risk of event occurrence relative to that of a “baseline individual” who has value 0 for every predictor in the model. Individuals who face higher relative risk will have scores greater than 1, those who face lower relative risk will have scores less than 1, and those who face no higher or lesser relative risk will have scores of approximately 1.

Here we will fit four Cox regression models to the rearrest data using the coxph() function from the survival package: A model that includes person_crime as a predictor (Model A), a model that includes property_crime as a predictor (Model B), a model that includes age as a predictor (Model C), and a model that includes all three of these predictors (Model D).

# Fit models ------------------------------------------------------------------
rearrest_fit_A <- coxph(
  Surv(months, 1 - censor) ~ person_crime, data = rearrest
)
rearrest_fit_B <- update(rearrest_fit_A, . ~ property_crime)
rearrest_fit_C <- update(rearrest_fit_A, . ~ age)
rearrest_fit_D <- update(rearrest_fit_A, . ~ person_crime + property_crime + age)

rearrest_fits <- list(
  "Model A" = rearrest_fit_A,
  "Model B" = rearrest_fit_B,
  "Model C" = rearrest_fit_C,
  "Model D" = rearrest_fit_D
)

# Test hypotheses -------------------------------------------------------------
rearrest_fits_htests <- list(LR = "LR", Wald = "Wald") |>
  map(
    \(.test) {
      rearrest_fits |>
        map(
          \(.fit) {
            .fit |>
              Anova(type = 2, test.statistic = .test) |>
              tidy() |>
              filter(term != "NULL") |>
              mutate(term = paste0(term, " = 0")) |>
              select(term, statistic)
          }
        ) |>
        list_rbind(names_to = "model") |>
        pivot_wider(names_from = model, values_from = statistic)
    }
  ) |>
  list_rbind()

# Make table ------------------------------------------------------------------
options(modelsummary_get = "all")

# Table 14.1, page 525:
rearrest_fits |>
  # {modelsummary} doesn't have a built-in way to include raw and exponentiated
  # coefficients in a table at the same time, so we will use the
  # `modelsummary_list` approach to customize model output. See:
  # https://modelsummary.com/vignettes/modelsummary_extension.html
  map(
    \(.fit) {
      ms_list <- list(
        tidy = rbind(
          tidy(.fit),
          mutate(
            tidy(.fit, exponentiate = TRUE), term = paste0("exp(", term, ")")
          )
        ),
        glance = glance(.fit)
      )
      class(ms_list) <- "modelsummary_list"
      ms_list
    }
  ) |>
  modelsummary(
    fmt = 4,
    coef_map = c(
      "person_crime",
      "property_crime",
      "age",
      "exp(person_crime)",
      "exp(property_crime)",
      "exp(age)"
    ),
    gof_map = list(
      list(
        raw = "logLik",
        clean = "-2LL",
        fmt = \(.x) vec_fmt_number(
          -2*as.numeric(.x), decimals = 2, sep_mark = ""
        )
      ),
      list(
        raw = "statistic.log",
        clean = "LR statistic",
        fmt = fmt_decimal(2)
      ),
      list(
        raw = "p.value.log",
        clean = "P",
        fmt = fmt_decimal(pdigits = 4)
      ),
      list(
        raw = "AIC",
        clean = "AIC",
        fmt = fmt_decimal(2)
      ),
      list(
        raw = "BIC",
        clean = "BIC",
        fmt = fmt_decimal(2)
      )
    ),
    add_rows = rearrest_fits_htests,
    output = "gt",
  ) |>
  tab_row_group(label = "Wald Hypothesis Tests", rows = 21:23) |>
  tab_row_group(label = "Likelihood-Ratio Hypothesis Tests", rows = 18:20) |>
  tab_row_group(label = "Goodness-of-Fit", rows = 13:17) |>
  tab_row_group(label = "Hazard Ratios", rows = 7:12) |>
  tab_row_group(label = "Parameter Estimates", rows = 1:6)

	Model A	Model B	Model C	Model D
Parameter Estimates
person_crime	0.4787			0.5691
	(0.2025)			(0.2052)
property_crime		1.1946		0.9358
		(0.3493)		(0.3509)
age			-0.0681	-0.0667
			(0.0156)	(0.0168)
Hazard Ratios
exp(person_crime)	1.6140			1.7667
	(0.2025)			(0.2052)
exp(property_crime)		3.3022		2.5492
		(0.3493)		(0.3509)
exp(age)			0.9341	0.9355
			(0.0156)	(0.0168)
Goodness-of-Fit
-2LL	984.08	973.20	966.43	950.44
LR statistic	5.32	16.20	22.97	38.96
P	0.021	0.000	0.000	0.000
AIC	986.08	975.20	968.43	956.44
BIC	988.74	977.86	971.10	964.43
Likelihood-Ratio Hypothesis Tests
person_crime = 0	5.3237			7.2760
property_crime = 0		16.2050		9.1493
age = 0			22.9688	18.3234
Wald Hypothesis Tests
person_crime = 0	5.5894			7.6920
property_crime = 0		11.6970		7.1129
age = 0			19.0014	15.8094

The augment() function from the broom package can be used to get the predicted risk scores for each individual from the fitted Cox regression model by setting the type.predict argument to "risk". Following Singer and Willett (2003), here we display predictions for a subset of individuals with risk scores of varying size.

# Filter by ID order to match the textbook table.
rearrest_subset <- c(22, 8, 187, 26, 5, 130, 106, 33) |>
  map(\(.id) filter(rearrest, id == .id)) |>
  list_rbind()

# Table 14.2, page 533:
rearrest_fit_D |>
  augment(newdata = rearrest_subset, type.predict = "risk") |>
  rename(risk_score = .fitted) |>
  select(-.se.fit) |>
  relocate(days:censor, .after = risk_score) |>
  gt() |>
  fmt_number(columns = age, decimals = 3) |>
  fmt_number(columns = risk_score, decimals = 2) |>
  fmt_number(columns = months, decimals = 4)

id	person_crime	property_crime	age	risk_score	days	months	censor
22	0	0	0.258	0.98	52	1.7084	1
8	1	1	22.451	1.01	19	0.6242	1
187	1	0	−7.200	2.86	1095	36.0000	1
26	0	1	−7.301	4.15	72	2.3655	0
5	1	1	−7.165	7.26	9	0.2957	0
130	0	1	22.391	0.57	486	15.9671	1
106	0	0	16.203	0.34	356	11.6961	0
33	1	0	27.061	0.29	85	2.7926	1

14.4 Nonparametric strategies for displaying the results of model fitting

In Section 14.4 Singer and Willett (2003) discuss nonparametric methods for recovering the baseline cumulative hazard function from a fitted Cox regression model, allowing for the plotting of survival and cumulative hazard functions for both “baseline” and prototypical individuals.

The baseline cumulative hazard function can be recovered from a fitted Cox regression model using the survfit() function from the survival package. Here we will plot recovered baseline survival and cumulative hazard functions from two Cox regression models: a model whose “baseline” individual is a former inmate of average age upon release who had no history of either property or personal crime (Model D), and a model whose “baseline” individual is a former inmate of average age upon release who had an “average” history of both property and personal crime (Model E).

# Fit model -------------------------------------------------------------------
rearrest_fit_E <- update(
  rearrest_fit_A,
  . ~ scale(person_crime, scale = FALSE)
    + scale(property_crime, scale = FALSE)
    + age
)

rearrest_fits_2 <- list(
  "Model D" = rearrest_fit_D,
  "Model E" = rearrest_fit_E
)

# Make plots ------------------------------------------------------------------

# Figure 14.4, page 538:
rearrest_fits_2 |>
  map(
    \(.fit) {
      rearrest_survfit <- .fit |>
        survfit() |>
        survfit0() |>
        tidy() |>
        # Filter out censored tail rows for display purposes.
        filter(
          row_number() <= row_number()[estimate == min(estimate) & n.event > 0]
        ) |>
        mutate(cumhaz.estimate = -log(estimate)) |>
        rename(surv.estimate = estimate)
      
      rearrest_surv <- ggplot(rearrest_survfit, aes(x = time, y = surv.estimate)) +
        geom_line() +
        geom_point(size = .5) +
        scale_x_continuous(breaks = seq(0, 36, 6)) +
        coord_cartesian(xlim = c(0, 36), ylim = c(0, 1))
      
      rearrest_cumhaz <- rearrest_surv +
        aes(y = cumhaz.estimate) +
        coord_cartesian(xlim = c(0, 36), ylim = c(0, 1.5))
      
      rearrest_surv / rearrest_cumhaz +
        plot_layout(axes = "collect")
    }
  ) |>
  wrap_plots()

Finally, fitted survival and cumulative hazard functions at selected predictor values can be obtained using the newdata argument of the survfit() function. Here we will plot fitted functions for all possible combinations of person_crime and property_crime, with age set to its sample mean.

# Fit functions ---------------------------------------------------------------
rearrest_ptypes <- list(
  neither       = tibble(person_crime = 0, property_crime = 0, age = 0),
  personal_only = tibble(person_crime = 1, property_crime = 0, age = 0),
  property_only = tibble(person_crime = 0, property_crime = 1, age = 0),
  both          = tibble(person_crime = 1, property_crime = 1, age = 0)
)

rearrest_ptype_funs <- rearrest_ptypes |>
  map(
    \(.newdata) {
      rearrest_fit_D |>
        survfit(newdata = .newdata) |>
        survfit0() |>
        tidy() |>
        # Filter out censored tail rows for display purposes.
        filter(
          row_number() <= row_number()[estimate == min(estimate) & n.event > 0]
        ) |>
        mutate(
          cumhaz.estimate = -log(estimate),
          logcumhaz.estimate = log(cumhaz.estimate)
        ) |>
        rename(surv.estimate = estimate)
    }
  ) |>
  list_rbind(names_to = "crime")

# Make plots ------------------------------------------------------------------
rearrest_ptype_surv <- ggplot(rearrest_ptype_funs) +
  aes(x = time, y = surv.estimate, colour = crime) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 36, 6)) +
  scale_color_brewer(palette = "Dark2") +
  coord_cartesian(xlim = c(0, 36), ylim = c(0, 1))

rearrest_ptype_cumhaz <- rearrest_ptype_surv +
  aes(y = cumhaz.estimate) +
  coord_cartesian(xlim = c(0, 36), ylim = c(0, 2))

rearrest_ptype_logcumhaz <- rearrest_ptype_surv +
  aes(y = logcumhaz.estimate) +
  scale_y_continuous(breaks = -7:1) +
  coord_cartesian(xlim = c(0, 36), ylim = c(-7, 1))

# Figure 14.5, page 541:
rearrest_ptype_surv / rearrest_ptype_cumhaz / rearrest_ptype_logcumhaz +
  plot_layout(guides = "collect", axes = "collect")