Skip to content

Instantly share code, notes, and snippets.

@vincentarelbundock
Last active December 9, 2023 02:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vincentarelbundock/4313644338740a0238933e16124607f9 to your computer and use it in GitHub Desktop.
Save vincentarelbundock/4313644338740a0238933e16124607f9 to your computer and use it in GitHub Desktop.
Maringe et al. 2020
# Downloads data here:
# https://academic.oup.com/ije/article/49/5/1719/5835351#226179197
#############################################################################
# TRIAL EMULATION: SURGERY WITHIN 6 MONTHS AMONG OLDER LUNG CANCER PATIENTS
# Author: Clemence Leyrat (from Camille Maringe's Stata code)
# Refactored by Vincent Arel-Bundock on 2023-12-08
# Content: recoding the outcome and follow-up time for our emulated trial
# creating the censoring indicators and time uncensored
#############################################################################
# Packages
library(survival) # For survival analysis
library(tinytest)
library(boot)
set.seed(1024)
tab <- read.csv("ije-2019-08-1035-File008.csv", sep = ",", header = T)
###############################################################################################
# VARIABLES
# id: patient identifier
# fup_obs: observed follow-up time (time to death or 1 year if censored alive)
# death: observed event of interest (all-cause death) 1: dead, 0:alive
# timetosurgery: time to surgery (NA if no surgery)
# surgery: observed treatment 1 if the patient received surgery within 6 month, 0 otherwise
# age: age at diagnosis
# sex: patient's sex
# perf: performance status at diagnosis
# stage: stage at diagnosis
# deprivation: deprivation score
# charlson: Charlson's comorbidity index
# emergency: route to diagnosis
# Dataset: these variables are in a dataframe called "tab" (unavailable)
###############################################################################################
########################################################################
# STEP 1-CLONING: CREATION OF OUTCOME AND FOLLOW-UP TIME IN EACH ARM
########################################################################
# Note: Letters A, B... refer to the scenarios identified in our graph
# We will create two new variables:
# - fup: the follow-up time in the emulated trial (which can be different from the observed follow-up time)
# - outcome: the outcome in the emulated trial (which can be different from the observed follow-up time)
# These variables will be the outcome and follow up time in the outcome model
#########################################################################################################################################
clone <- function(tab, arm = "Control") {
# Create a copy of the dataset for the 'Control' arm
out <- tab
out$arm <- arm
# Case 1: Patients receive surgery within 6 months
idx <- which(out$surgery == 1 & out$timetosurgery <= 182.62)
out$outcome[idx] <- ifelse(out$arm[idx] == "Control", 0, out$death[idx])
out$fup[idx] <- ifelse(out$arm[idx] == "Control", out$timetosurgery[idx], out$fup_obs[idx])
# Case 2: Patients do not receive surgery within 6 months
if (arm == "Control") {
idx <- which(out$surgery == 0 | (out$surgery == 1 & out$timetosurgery > 182.62))
out$outcome[idx] <- out$death[idx]
out$fup[idx] <- out$fup_obs[idx]
# Case 2: Patients die or are lost to follow-up before 6 months without having surgery
} else if (arm == "Surgery") {
idx <- which(out$surgery == 0 & out$fup_obs <= 182.62)
out$outcome[idx] <- out$death[idx]
out$fup[idx] <- out$fup_obs[idx]
}
# Case 3: Patients do not receive surgery within 6 months and are still alive or at risk at 6 months
if (arm == "Surgery") {
idx <- which((out$surgery == 0 & out$fup_obs > 182.62) | (out$surgery == 1 & out$timetosurgery > 182.62))
out$outcome[idx] <- 0
out$fup[idx] <- 182.62
}
# Censoring status and follow-up time uncensored
# Case 1
idx <- out$surgery == 1 & out$timetosurgery <= 182.62
out$censoring[idx] <- ifelse(arm == "Control", 1, 0)
out$fup_uncensored[idx] <- out$timetosurgery[idx]
# Case 2
idx <- out$surgery == 0 & out$fup_obs <= 182.62
out$censoring[idx] <- 0
out$fup_uncensored[idx] <- out$fup_obs[idx]
# Case 3
idx <- (out$surgery == 0 & out$fup_obs > 182.62) | (out$surgery == 1 & out$timetosurgery > 182.62)
out$censoring[idx] <- ifelse(arm == "Surgery", 1, 0)
out$fup_uncensored[idx] <- 182.62
return(out)
}
tab <- rbind(
clone(tab, "Control"),
clone(tab, "Surgery")
)
# Bootstrap function
fboot <- function(tab, indices) {
t <- tab[tab$arm == "Control", ]
t1 <- tab[tab$arm == "Surgery", ]
tab0 <- t[indices, ] # allows boot to select sample
select <- tab0$id
tab1 <- t1[t1$id %in% select, ]
tab <- rbind(tab0, tab1)
####################################################
# STEP 2-SPLITTING THE DATASET AT EACH TIME OF EVENT
####################################################
# Dataframe containing the time of events and an ID for the times of events
t_events <- sort(unique(tab$fup))
times <- data.frame("tevent" = t_events, "ID_t" = seq(1:length(t_events)))
process_data <- function(tab, t_events, times, arm = "Surgery") {
# Filtering the dataset for the 'Surgery' arm
tab_s <- tab[tab$arm == arm, ]
# Creation of the entry variable (Tstart, 0 for everyone)
tab_s$Tstart <- 0
# Splitting the dataset at each time of event until the event happens and sorting it
data.long <- survSplit(tab_s,
cut = t_events, end = "fup",
start = "Tstart", event = "outcome", id = "ID")
data.long <- data.long[order(data.long$ID, data.long$fup), ]
# Splitting the original dataset at each time of event and sorting it until censoring happens
data.long.cens <- survSplit(tab_s,
cut = t_events, end = "fup",
start = "Tstart", event = "censoring", id = "ID")
data.long.cens <- data.long.cens[order(data.long.cens$ID, data.long.cens$fup), ]
# Replacing the censoring variable in data.long by the censoring variable obtained in the second split dataset
data.long$censoring <- data.long.cens$censoring
# Creating Tstop (end of the interval)
data.long$Tstop <- data.long$fup
# Merge with 'times' data and sort
data.long <- merge(data.long, times, by.x = "Tstart", by.y = "tevent", all.x = TRUE)
data.long <- data.long[order(data.long$ID, data.long$fup), ]
data.long$ID_t[is.na(data.long$ID_t)] <- 0
return(data.long)
}
data <- rbind(
process_data(tab, t_events, times, arm = "Surgery"),
process_data(tab, t_events, times, arm = "Control")
)
data_final <- merge(data, times, by = "ID_t", all.x = T)
data_final <- data_final[order(data_final$ID, data_final$fup), ]
############################################
# STEP 3- ESTIMATING THE CENSORING WEIGHTS
############################################
censoring_weights <- function(data_final, arm = "Surgery") {
data.long <- data_final[which(data_final$arm == arm), ]
# STEP 1: censoring model
# Cox model
ms_cens <- coxph(Surv(Tstart, Tstop, censoring) ~ age + sex + emergency +
stage + deprivation + charlson + perf, ties = "efron", data = data.long)
data.long$risk <- predict(
ms_cens,
newdata = data.long,
type = "risk",
reference = "zero")
# Estimating the cumulative hazard (when covariates=0)
dat.base <- data.frame(basehaz(ms_cens, centered = F))
names(dat.base) <- c("hazard", "t")
dat.base <- unique(merge(dat.base, times, by.x = "t", by.y = "tevent", all.x = T))
# Merging and reordering the dataset
data.long <- merge(data.long, dat.base, by = "ID_t", all.x = T)
data.long <- data.long[order(data.long$id, data.long$fup), ]
data.long$hazard[is.na(data.long$hazard)] <- 0
# Estimating the probability of remaining uncensored at each time of event
data.long$P_uncens <- exp(-(data.long$hazard) * data.long$risk)
# IPC Weights are the inverse of the probability of remaining uncensored
data.long$weight_Cox <- 1 / data.long$P_uncens
return(data.long)
}
data.long.Cox <- rbind(
censoring_weights(data_final, arm = "Surgery"),
censoring_weights(data_final, arm = "Control")
)
############################################
# STEP 3- ESTIMATING THE SURVIVOR FUNCTION
############################################
##################################################
# Emulated trial with Cox weights (Kaplan-Meier)
##################################################
emul_Cox_s <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
data = data.long.Cox[data.long.Cox$arm == "Surgery", ], weights = weight_Cox)
S1 <- min(emul_Cox_s$surv) # 1 year survival in the surgery group
fit.tableM <- summary(emul_Cox_s, rmean = 365)$table
RMST1 <- fit.tableM["*rmean"] # Estimated RMST in the surgery group
emul_Cox_c <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
data = data.long.Cox[data.long.Cox$arm == "Control", ], weights = weight_Cox)
S0 <- min(emul_Cox_c$surv) # 1 year survival in the control group
fit.tableM2 <- summary(emul_Cox_c, rmean = 365)$table
RMST0 <- fit.tableM2["*rmean"] # Estimated RMST in the control group
Diff_surv <- S1 - S0 # Difference in 1 year survival
Diff_RMST <- RMST1 - RMST0 # Difference in RMST
##################################################
# Emulated trial with Cox weights (Cox model)
##################################################
Cox_w <- coxph(Surv(Tstart, Tstop, outcome) ~ arm,
data = data.long.Cox, weights = weight_Cox)
HR <- exp(Cox_w$coefficients) # Hazard ratio
res <- c(Diff_surv, Diff_RMST, HR)
names(res) <- c("Diff_surv", "Diff_RMST", "HR")
return(res)
}
##############################################
# CALLING THE BOOTSTRAP FUNCTION
##############################################
# Bootstrapping with 1000 replications
results <- boot(data = tab, statistic = fboot, R = 100)
expect_equal(unname(results$t0[1]), .12, tolerance = .01) |> print()
expect_equal(unname(results$t0[3]), 0.6072863, tolerance = .01) |> print()
# 95% confidence intervals for each measure
# boot.ci(results,type="norm",index=1) #Difference in survival
# boot.ci(results,type="norm",index=2) #Difference in RMST
# boot.ci(results,type="norm",index=3) #HR
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment