Skip to content

Instantly share code, notes, and snippets.

@mcshaz
Last active July 16, 2020 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mcshaz/b4793c19096669a89ef83522437ec79d to your computer and use it in GitHub Desktop.
Save mcshaz/b4793c19096669a89ef83522437ec79d to your computer and use it in GitHub Desktop.
Comparing Survival of a Sample to a Standard Population (Finkelstein D, Muzikansky A, Schoenfeld D) using R
# see http://hedwig.mgh.harvard.edu/biostatistics/node/30 for the original paper and an excel spreadsheet with macros
# the spreadsheet does much the same in VBA, but with considerably more code
# unlike the vba spreadsheet, this also takes into acount the fraction of age in years at entry and exit (in creating per patient risk)
# if the age is greater than the maximum data (101 years old), each subsequent year is a repeat of the final annual risk available
library(survival)
oneSampleSurvival <- function(df, description) {
# NZ data between 0 and 100 for each year
# 10 years of population and death data was averaged (2009-2018) for each year of age until 89 years old
# data is then repeated in 5 yearly blocks as more granular data was hard to obtain
# mortality at 0 years excludes mortality before 28 days (not relevant)
maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
femaleMortality <- c(0.001440725276587, 0.000378599537757, 0.00021709924675, 9.97502891438584E-05, 9.85006425048784E-05, 8.92050090982164E-05, 8.95206096251493E-05, 7.96114330270264E-05, 6.92762143799792E-05, 0.000101432919309, 5.08040463311748E-05, 9.22608680532037E-05, 0.000122942485987, 0.000153294565916, 0.000172502569892, 0.000260513869301, 0.000255523432279, 0.000368255538044, 0.000356330469863, 0.000267139191191, 0.000358230732596, 0.000369833546194, 0.000305476109726, 0.000319588939803, 0.000297956887714, 0.000363010043724, 0.000362859918462, 0.000432002032646, 0.000321026034948, 0.000361625954016, 0.00040452437153, 0.000431745162574, 0.000512770712544, 0.00054317376104, 0.000488063712367, 0.000486563128689, 0.00057339122496, 0.00072400070304, 0.00072060329081, 0.000754517692419, 0.000849463968011, 0.000831645042867, 0.001042677971853, 0.001132957559664, 0.001207180709918, 0.00131585417895, 0.001513946743796, 0.001606617997058, 0.001723479078906, 0.001933808631927, 0.002226989986555, 0.002231269823877, 0.002626438738956, 0.002696431092572, 0.002966902852488, 0.003080837093037, 0.003413401888315, 0.003427147560834, 0.004010391720794, 0.004217567593114, 0.004606013461916, 0.005031607028952, 0.005503746741787, 0.006108782143772, 0.006845816438536, 0.007141344621113, 0.007955395638506, 0.009127032877493, 0.010042400111416, 0.010803760162666, 0.012057728392066, 0.01323141371974, 0.014562716593054, 0.015873768931767, 0.018880147634345, 0.02056309402693, 0.022568564901875, 0.025811635811795, 0.029461369991122, 0.032920253916872, 0.038426882965907, 0.042458470397799, 0.050308505460449, 0.05592788582796, 0.063922777668064, 0.074695945529539, 0.084828676018797, 0.095857321745606, 0.110779421454886, 0.129589230549757, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.444805194805195)
getYrMortality <- function(startAgeYr, endAgeYr, isMale){
if (isMale) {
lookupVector <- maleMortality
} else {
lookupVector <- femaleMortality
}
startIndx <- as.integer(startAgeYr) + 1L # add 1 as mortality begins at 0 years of age (died <28d excluded)
endIndx <- as.integer(endAgeYr) + 1L
if (endIndx > length(lookupVector)) {
lastIndx <- length(lookupVector)
returnVar <- c(lookupVector[startIndx:lastIndx], rep(lookupVector[lastIndx], endIndx - lastIndx))
} else {
returnVar <- lookupVector[startIndx:endIndx]
}
return(returnVar)
}
smrProportionalHazard <- function(observed, expected, ci = 95) {
stat <- (observed - expected) ^ 2 / expected
smr <- observed / expected
chiterm <- qchisq((100 + ci) / 200, 1)
ub <- smr + chiterm / (2 * expected) + ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
lb <- smr + chiterm / (2 * expected) - ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
return(c("smr" = smr,
"p_value" = 1 - pchisq(stat, 1),
"lower_ci" = lb,
"upper_ci" = ub))
}
riskOfDeath <- by(df, seq_len(nrow(df)), function(rw){
# extracts mortality data for each year of age between entry and exit date
# first and last date will be fractionally reduced according to how much of thay age year was remaining
startAgeYr <- as.numeric(rw$entry - rw$dob) / 365.25
startProportion <- startAgeYr %% 1
startAgeYr <- as.integer(startAgeYr)
endAgeYr <- as.numeric(rw$exit - rw$dob) / 365.25
endProportion <- endAgeYr %% 1
endAgeYr <- as.integer(endAgeYr)
returnVar <- getYrMortality(startAgeYr, endAgeYr, rw$male)
if (startAgeYr == endAgeYr) {
returnVar <- returnVar * (endProportion - startProportion)
} else {
returnVar[1] <- returnVar[1] * (1 - startProportion)
len <- length(returnVar)
returnVar[len] <- returnVar[len] * endProportion
}
return(sum(returnVar))
})
#main output
cat(description, "(using proportional hazards assumption)", "\n")
totalDeaths <- sum(df$dead)
totalRisk <- sum(riskOfDeath)
cat("observed / expected =", totalDeaths, "/", totalRisk, "\n")
print(smrProportionalHazard(totalDeaths, totalRisk))
df$yrsOldAtEntry <- as.numeric(df$entry - df$dob) / 365.25
df$yrsInStudy <- as.numeric(df$exit - df$entry) / 365.25
# create reference for population hazard
maxYears <- ceiling(max(df$yrsInStudy))
hazards <- by(df, seq_len(nrow(df)), function(rw){
mortVector <- getYrMortality(rw$yrsOldAtEntry, rw$yrsOldAtEntry + maxYears, rw$male)
return(exp(-cumsum(mortVector)))
})
hazards <- c(1, rowMeans(simplify2array(hazards)))
survData <- survfit(Surv(df$yrsInStudy, df$dead) ~ 1, data = df)
lowerY <- floor(survData$lower[length(survData$lower)] * 100) / 100
par(cex.sub = 0.8)
plot(survData,
xlab = "Years from op",
ylab = "Survival probability",
ylim = c(lowerY, 1),
sub = description)
lines(0:(as.integer(maxYears) + 1L), hazards, col="grey")
legend("bottomleft",
inset = 0.05,
legend = c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
col = c("grey", "black", "black"),
lty = c(1, 1, 2))
}
df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
# rename variables to required dataframe names for oneSampleSurvival function
names(df) <- c("male", "dead", "dob", "entry", "exit")
df$male <- df$male == 1
df$dead <- df$dead == 1
# looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
oneSampleSurvival (df[df$exit - df$entry > 30,], "surviving >30 days from operation")
oneSampleSurvival (df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "surviving >30 days and >14 yr old at operation")
oneSampleSurvival (df[df$exit - df$entry > 365,], "surviving at least 1 yr from operation")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment