vincentarelbundock/maringe_2020.R

## maringe_2020.R
# Downloads data here:

# https://academic.oup.com/ije/article/49/5/1719/5835351#226179197


#############################################################################
# TRIAL EMULATION: SURGERY WITHIN 6 MONTHS AMONG OLDER LUNG CANCER PATIENTS

# Author: Clemence Leyrat (from Camille Maringe's Stata code)
# Refactored by Vincent Arel-Bundock on 2023-12-08
# Content: recoding the outcome and follow-up time for our emulated trial
#           creating the censoring indicators and time uncensored

#############################################################################


# Packages
library(survival) # For survival analysis
library(tinytest)
library(boot)
set.seed(1024)

tab <- read.csv("ije-2019-08-1035-File008.csv", sep = ",", header = T)

###############################################################################################
# VARIABLES

# id: patient identifier
# fup_obs: observed follow-up time (time to death or 1 year if censored alive)
# death: observed event of interest (all-cause death) 1: dead, 0:alive
# timetosurgery: time to surgery (NA if no surgery)
# surgery: observed treatment 1 if the patient received surgery within 6 month, 0 otherwise
# age: age at diagnosis
# sex: patient's sex
# perf: performance status at diagnosis
# stage: stage at diagnosis
# deprivation: deprivation score
# charlson: Charlson's comorbidity index
# emergency: route to diagnosis

# Dataset: these variables are in a dataframe called "tab" (unavailable)

###############################################################################################


########################################################################
# STEP 1-CLONING: CREATION OF OUTCOME AND FOLLOW-UP TIME IN EACH ARM
########################################################################

# Note: Letters A, B... refer to the scenarios identified in our graph
# We will create two new variables:
# - fup: the follow-up time in the emulated trial (which can be different from the observed follow-up time)
# - outcome: the outcome in the emulated trial (which can be different from the observed follow-up time)

# These variables will be the outcome and follow up time in the outcome model
#########################################################################################################################################

clone <- function(tab, arm = "Control") {
  # Create a copy of the dataset for the 'Control' arm
  out <- tab
  out$arm <- arm

  # Case 1: Patients receive surgery within 6 months
  idx <- which(out$surgery == 1 & out$timetosurgery <= 182.62)
  out$outcome[idx] <- ifelse(out$arm[idx] == "Control", 0, out$death[idx])
  out$fup[idx] <- ifelse(out$arm[idx] == "Control", out$timetosurgery[idx], out$fup_obs[idx])

  # Case 2: Patients do not receive surgery within 6 months
  if (arm == "Control") {
    idx <- which(out$surgery == 0 | (out$surgery == 1 & out$timetosurgery > 182.62))
    out$outcome[idx] <- out$death[idx]
    out$fup[idx] <- out$fup_obs[idx]

    # Case 2: Patients die or are lost to follow-up before 6 months without having surgery
  } else if (arm == "Surgery") {
    idx <- which(out$surgery == 0 & out$fup_obs <= 182.62)
    out$outcome[idx] <- out$death[idx]
    out$fup[idx] <- out$fup_obs[idx]
  }

  # Case 3: Patients do not receive surgery within 6 months and are still alive or at risk at 6 months
  if (arm == "Surgery") {
    idx <- which((out$surgery == 0 & out$fup_obs > 182.62) | (out$surgery == 1 & out$timetosurgery > 182.62))
    out$outcome[idx] <- 0
    out$fup[idx] <- 182.62
  }

  # Censoring status and follow-up time uncensored

  # Case 1
  idx <- out$surgery == 1 & out$timetosurgery <= 182.62
  out$censoring[idx] <- ifelse(arm == "Control", 1, 0)
  out$fup_uncensored[idx] <- out$timetosurgery[idx]

  # Case 2
  idx <- out$surgery == 0 & out$fup_obs <= 182.62
  out$censoring[idx] <- 0
  out$fup_uncensored[idx] <- out$fup_obs[idx]

  # Case 3
  idx <- (out$surgery == 0 & out$fup_obs > 182.62) | (out$surgery == 1 & out$timetosurgery > 182.62)
  out$censoring[idx] <- ifelse(arm == "Surgery", 1, 0)
  out$fup_uncensored[idx] <- 182.62

  return(out)
}


tab <- rbind(
  clone(tab, "Control"),
  clone(tab, "Surgery")
)


# Bootstrap function
fboot <- function(tab, indices) {
  t <- tab[tab$arm == "Control", ]
  t1 <- tab[tab$arm == "Surgery", ]
  tab0 <- t[indices, ] # allows boot to select sample
  select <- tab0$id
  tab1 <- t1[t1$id %in% select, ]
  tab <- rbind(tab0, tab1)


  ####################################################
  # STEP 2-SPLITTING THE DATASET AT EACH TIME OF EVENT
  ####################################################

  # Dataframe containing the time of events and an ID for the times of events
  t_events <- sort(unique(tab$fup))
  times <- data.frame("tevent" = t_events, "ID_t" = seq(1:length(t_events)))

  process_data <- function(tab, t_events, times, arm = "Surgery") {
    # Filtering the dataset for the 'Surgery' arm
    tab_s <- tab[tab$arm == arm, ]

    # Creation of the entry variable (Tstart, 0 for everyone)
    tab_s$Tstart <- 0

    # Splitting the dataset at each time of event until the event happens and sorting it
    data.long <- survSplit(tab_s,
      cut = t_events, end = "fup",
      start = "Tstart", event = "outcome", id = "ID")
    data.long <- data.long[order(data.long$ID, data.long$fup), ]

    # Splitting the original dataset at each time of event and sorting it until censoring happens
    data.long.cens <- survSplit(tab_s,
      cut = t_events, end = "fup",
      start = "Tstart", event = "censoring", id = "ID")
    data.long.cens <- data.long.cens[order(data.long.cens$ID, data.long.cens$fup), ]

    # Replacing the censoring variable in data.long by the censoring variable obtained in the second split dataset
    data.long$censoring <- data.long.cens$censoring

    # Creating Tstop (end of the interval)
    data.long$Tstop <- data.long$fup

    # Merge with 'times' data and sort
    data.long <- merge(data.long, times, by.x = "Tstart", by.y = "tevent", all.x = TRUE)
    data.long <- data.long[order(data.long$ID, data.long$fup), ]
    data.long$ID_t[is.na(data.long$ID_t)] <- 0

    return(data.long)
  }


  data <- rbind(
    process_data(tab, t_events, times, arm = "Surgery"),
    process_data(tab, t_events, times, arm = "Control")
  )
  data_final <- merge(data, times, by = "ID_t", all.x = T)
  data_final <- data_final[order(data_final$ID, data_final$fup), ]


  ############################################
  # STEP 3- ESTIMATING THE CENSORING WEIGHTS
  ############################################

  censoring_weights <- function(data_final, arm = "Surgery") {
    data.long <- data_final[which(data_final$arm == arm), ]

    # STEP 1: censoring model
    # Cox model
    ms_cens <- coxph(Surv(Tstart, Tstop, censoring) ~ age + sex + emergency +
      stage + deprivation + charlson + perf, ties = "efron", data = data.long)

    data.long$risk <- predict(
      ms_cens,
      newdata = data.long,
      type = "risk",
      reference = "zero")

    # Estimating the cumulative hazard (when covariates=0)
    dat.base <- data.frame(basehaz(ms_cens, centered = F))
    names(dat.base) <- c("hazard", "t")
    dat.base <- unique(merge(dat.base, times, by.x = "t", by.y = "tevent", all.x = T))

    # Merging and reordering the dataset
    data.long <- merge(data.long, dat.base, by = "ID_t", all.x = T)
    data.long <- data.long[order(data.long$id, data.long$fup), ]
    data.long$hazard[is.na(data.long$hazard)] <- 0


    # Estimating the probability of remaining uncensored at each time of event
    data.long$P_uncens <- exp(-(data.long$hazard) * data.long$risk)

    # IPC Weights are the inverse of the probability of remaining uncensored
    data.long$weight_Cox <- 1 / data.long$P_uncens

    return(data.long)
  }

  data.long.Cox <- rbind(
    censoring_weights(data_final, arm = "Surgery"),
    censoring_weights(data_final, arm = "Control")
  )


  ############################################
  # STEP 3- ESTIMATING THE SURVIVOR FUNCTION
  ############################################

  ##################################################
  # Emulated trial with Cox weights (Kaplan-Meier)
  ##################################################

  emul_Cox_s <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
    data = data.long.Cox[data.long.Cox$arm == "Surgery", ], weights = weight_Cox)
  S1 <- min(emul_Cox_s$surv) # 1 year survival in the surgery group
  fit.tableM <- summary(emul_Cox_s, rmean = 365)$table
  RMST1 <- fit.tableM["*rmean"] # Estimated RMST in the surgery group

  emul_Cox_c <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
    data = data.long.Cox[data.long.Cox$arm == "Control", ], weights = weight_Cox)
  S0 <- min(emul_Cox_c$surv) # 1 year survival in the control group
  fit.tableM2 <- summary(emul_Cox_c, rmean = 365)$table
  RMST0 <- fit.tableM2["*rmean"] # Estimated RMST in the control group

  Diff_surv <- S1 - S0 # Difference in 1 year survival
  Diff_RMST <- RMST1 - RMST0 # Difference in RMST

  ##################################################
  # Emulated trial with Cox weights (Cox model)
  ##################################################

  Cox_w <- coxph(Surv(Tstart, Tstop, outcome) ~ arm,
    data = data.long.Cox, weights = weight_Cox)
  HR <- exp(Cox_w$coefficients) # Hazard ratio


  res <- c(Diff_surv, Diff_RMST, HR)
  names(res) <- c("Diff_surv", "Diff_RMST", "HR")

  return(res)
}


##############################################
# CALLING THE BOOTSTRAP FUNCTION
##############################################

# Bootstrapping with 1000 replications
results <- boot(data = tab, statistic = fboot, R = 100)
expect_equal(unname(results$t0[1]), .12, tolerance = .01) |> print()
expect_equal(unname(results$t0[3]), 0.6072863, tolerance = .01) |> print()

# 95% confidence intervals for each measure
# boot.ci(results,type="norm",index=1) #Difference in survival
# boot.ci(results,type="norm",index=2) #Difference in RMST
# boot.ci(results,type="norm",index=3) #HR
	# Downloads data here:

	# https://academic.oup.com/ije/article/49/5/1719/5835351#226179197


	#############################################################################
	# TRIAL EMULATION: SURGERY WITHIN 6 MONTHS AMONG OLDER LUNG CANCER PATIENTS

	# Author: Clemence Leyrat (from Camille Maringe's Stata code)
	# Refactored by Vincent Arel-Bundock on 2023-12-08
	# Content: recoding the outcome and follow-up time for our emulated trial
	# creating the censoring indicators and time uncensored

	#############################################################################


	# Packages
	library(survival) # For survival analysis
	library(tinytest)
	library(boot)
	set.seed(1024)

	tab <- read.csv("ije-2019-08-1035-File008.csv", sep = ",", header = T)

	###############################################################################################
	# VARIABLES

	# id: patient identifier
	# fup_obs: observed follow-up time (time to death or 1 year if censored alive)
	# death: observed event of interest (all-cause death) 1: dead, 0:alive
	# timetosurgery: time to surgery (NA if no surgery)
	# surgery: observed treatment 1 if the patient received surgery within 6 month, 0 otherwise
	# age: age at diagnosis
	# sex: patient's sex
	# perf: performance status at diagnosis
	# stage: stage at diagnosis
	# deprivation: deprivation score
	# charlson: Charlson's comorbidity index
	# emergency: route to diagnosis

	# Dataset: these variables are in a dataframe called "tab" (unavailable)

	###############################################################################################


	########################################################################
	# STEP 1-CLONING: CREATION OF OUTCOME AND FOLLOW-UP TIME IN EACH ARM
	########################################################################

	# Note: Letters A, B... refer to the scenarios identified in our graph
	# We will create two new variables:
	# - fup: the follow-up time in the emulated trial (which can be different from the observed follow-up time)
	# - outcome: the outcome in the emulated trial (which can be different from the observed follow-up time)

	# These variables will be the outcome and follow up time in the outcome model
	#########################################################################################################################################

	clone <- function(tab, arm = "Control") {
	# Create a copy of the dataset for the 'Control' arm
	out <- tab
	out$arm <- arm

	# Case 1: Patients receive surgery within 6 months
	idx <- which(out$surgery == 1 & out$timetosurgery <= 182.62)
	out$outcome[idx] <- ifelse(out$arm[idx] == "Control", 0, out$death[idx])
	out$fup[idx] <- ifelse(out$arm[idx] == "Control", out$timetosurgery[idx], out$fup_obs[idx])

	# Case 2: Patients do not receive surgery within 6 months
	if (arm == "Control") {
	idx <- which(out$surgery == 0 \| (out$surgery == 1 & out$timetosurgery > 182.62))
	out$outcome[idx] <- out$death[idx]
	out$fup[idx] <- out$fup_obs[idx]

	# Case 2: Patients die or are lost to follow-up before 6 months without having surgery
	} else if (arm == "Surgery") {
	idx <- which(out$surgery == 0 & out$fup_obs <= 182.62)
	out$outcome[idx] <- out$death[idx]
	out$fup[idx] <- out$fup_obs[idx]
	}

	# Case 3: Patients do not receive surgery within 6 months and are still alive or at risk at 6 months
	if (arm == "Surgery") {
	idx <- which((out$surgery == 0 & out$fup_obs > 182.62) \| (out$surgery == 1 & out$timetosurgery > 182.62))
	out$outcome[idx] <- 0
	out$fup[idx] <- 182.62
	}

	# Censoring status and follow-up time uncensored

	# Case 1
	idx <- out$surgery == 1 & out$timetosurgery <= 182.62
	out$censoring[idx] <- ifelse(arm == "Control", 1, 0)
	out$fup_uncensored[idx] <- out$timetosurgery[idx]

	# Case 2
	idx <- out$surgery == 0 & out$fup_obs <= 182.62
	out$censoring[idx] <- 0
	out$fup_uncensored[idx] <- out$fup_obs[idx]

	# Case 3
	idx <- (out$surgery == 0 & out$fup_obs > 182.62) \| (out$surgery == 1 & out$timetosurgery > 182.62)
	out$censoring[idx] <- ifelse(arm == "Surgery", 1, 0)
	out$fup_uncensored[idx] <- 182.62

	return(out)
	}


	tab <- rbind(
	clone(tab, "Control"),
	clone(tab, "Surgery")
	)


	# Bootstrap function
	fboot <- function(tab, indices) {
	t <- tab[tab$arm == "Control", ]
	t1 <- tab[tab$arm == "Surgery", ]
	tab0 <- t[indices, ] # allows boot to select sample
	select <- tab0$id
	tab1 <- t1[t1$id %in% select, ]
	tab <- rbind(tab0, tab1)


	####################################################
	# STEP 2-SPLITTING THE DATASET AT EACH TIME OF EVENT
	####################################################

	# Dataframe containing the time of events and an ID for the times of events
	t_events <- sort(unique(tab$fup))
	times <- data.frame("tevent" = t_events, "ID_t" = seq(1:length(t_events)))

	process_data <- function(tab, t_events, times, arm = "Surgery") {
	# Filtering the dataset for the 'Surgery' arm
	tab_s <- tab[tab$arm == arm, ]

	# Creation of the entry variable (Tstart, 0 for everyone)
	tab_s$Tstart <- 0

	# Splitting the dataset at each time of event until the event happens and sorting it
	data.long <- survSplit(tab_s,
	cut = t_events, end = "fup",
	start = "Tstart", event = "outcome", id = "ID")
	data.long <- data.long[order(data.long$ID, data.long$fup), ]

	# Splitting the original dataset at each time of event and sorting it until censoring happens
	data.long.cens <- survSplit(tab_s,
	cut = t_events, end = "fup",
	start = "Tstart", event = "censoring", id = "ID")
	data.long.cens <- data.long.cens[order(data.long.cens$ID, data.long.cens$fup), ]

	# Replacing the censoring variable in data.long by the censoring variable obtained in the second split dataset
	data.long$censoring <- data.long.cens$censoring

	# Creating Tstop (end of the interval)
	data.long$Tstop <- data.long$fup

	# Merge with 'times' data and sort
	data.long <- merge(data.long, times, by.x = "Tstart", by.y = "tevent", all.x = TRUE)
	data.long <- data.long[order(data.long$ID, data.long$fup), ]
	data.long$ID_t[is.na(data.long$ID_t)] <- 0

	return(data.long)
	}


	data <- rbind(
	process_data(tab, t_events, times, arm = "Surgery"),
	process_data(tab, t_events, times, arm = "Control")
	)
	data_final <- merge(data, times, by = "ID_t", all.x = T)
	data_final <- data_final[order(data_final$ID, data_final$fup), ]


	############################################
	# STEP 3- ESTIMATING THE CENSORING WEIGHTS
	############################################

	censoring_weights <- function(data_final, arm = "Surgery") {
	data.long <- data_final[which(data_final$arm == arm), ]

	# STEP 1: censoring model
	# Cox model
	ms_cens <- coxph(Surv(Tstart, Tstop, censoring) ~ age + sex + emergency +
	stage + deprivation + charlson + perf, ties = "efron", data = data.long)

	data.long$risk <- predict(
	ms_cens,
	newdata = data.long,
	type = "risk",
	reference = "zero")

	# Estimating the cumulative hazard (when covariates=0)
	dat.base <- data.frame(basehaz(ms_cens, centered = F))
	names(dat.base) <- c("hazard", "t")
	dat.base <- unique(merge(dat.base, times, by.x = "t", by.y = "tevent", all.x = T))

	# Merging and reordering the dataset
	data.long <- merge(data.long, dat.base, by = "ID_t", all.x = T)
	data.long <- data.long[order(data.long$id, data.long$fup), ]
	data.long$hazard[is.na(data.long$hazard)] <- 0


	# Estimating the probability of remaining uncensored at each time of event
	data.long$P_uncens <- exp(-(data.long$hazard) * data.long$risk)

	# IPC Weights are the inverse of the probability of remaining uncensored
	data.long$weight_Cox <- 1 / data.long$P_uncens

	return(data.long)
	}

	data.long.Cox <- rbind(
	censoring_weights(data_final, arm = "Surgery"),
	censoring_weights(data_final, arm = "Control")
	)


	############################################
	# STEP 3- ESTIMATING THE SURVIVOR FUNCTION
	############################################

	##################################################
	# Emulated trial with Cox weights (Kaplan-Meier)
	##################################################

	emul_Cox_s <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
	data = data.long.Cox[data.long.Cox$arm == "Surgery", ], weights = weight_Cox)
	S1 <- min(emul_Cox_s$surv) # 1 year survival in the surgery group
	fit.tableM <- summary(emul_Cox_s, rmean = 365)$table
	RMST1 <- fit.tableM["*rmean"] # Estimated RMST in the surgery group

	emul_Cox_c <- survfit(Surv(Tstart, Tstop, outcome) ~ 1,
	data = data.long.Cox[data.long.Cox$arm == "Control", ], weights = weight_Cox)
	S0 <- min(emul_Cox_c$surv) # 1 year survival in the control group
	fit.tableM2 <- summary(emul_Cox_c, rmean = 365)$table
	RMST0 <- fit.tableM2["*rmean"] # Estimated RMST in the control group

	Diff_surv <- S1 - S0 # Difference in 1 year survival
	Diff_RMST <- RMST1 - RMST0 # Difference in RMST

	##################################################
	# Emulated trial with Cox weights (Cox model)
	##################################################

	Cox_w <- coxph(Surv(Tstart, Tstop, outcome) ~ arm,
	data = data.long.Cox, weights = weight_Cox)
	HR <- exp(Cox_w$coefficients) # Hazard ratio


	res <- c(Diff_surv, Diff_RMST, HR)
	names(res) <- c("Diff_surv", "Diff_RMST", "HR")

	return(res)
	}


	##############################################
	# CALLING THE BOOTSTRAP FUNCTION
	##############################################

	# Bootstrapping with 1000 replications
	results <- boot(data = tab, statistic = fboot, R = 100)
	expect_equal(unname(results$t0[1]), .12, tolerance = .01) \|> print()
	expect_equal(unname(results$t0[3]), 0.6072863, tolerance = .01) \|> print()

	# 95% confidence intervals for each measure
	# boot.ci(results,type="norm",index=1) #Difference in survival
	# boot.ci(results,type="norm",index=2) #Difference in RMST
	# boot.ci(results,type="norm",index=3) #HR