Last active
April 19, 2022 23:56
-
-
Save danlewer/1c4fd37c8dd4ea11483236644890de63 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function requires data.table | |
library(data.table) | |
#------------------------ | |
# exact matching function | |
#------------------------ | |
# stratifies dataset and then selects random observations within strata | |
# data = dataset containing: | |
# - treatment/exposure variable 'mvar' (a string specifying variable name). | |
# - matching variable 'mvar' (a string specifying variable name). If you want to match on multiple variables, concatenate them first. | |
# other inputs are: | |
# - ratio of cases:controls (an integer > 0). You can also set ratio between 0 and 1, e.g. to 0.5 to get 2 cases/control, but the results aren't perfect | |
# - seed for fixing random selection of cases/controls (an integer; default NULL means no seed). Choice of seed is arbitrary. Seed will be reset globally at the end of the function (i.e. the function will overwrite a previous seed) | |
# returns data.table of matched observations, with additional variable 'id' for use in paired/grouped analyses | |
# the speed of the function mostly depends on the number of strata (i.e. levels of the matching variable) | |
# there is no problem converting the results to a standard data frame with data.frame(matched_data) if you prefer working in this way | |
smatch <- function (data, treat, mvar, ratio = 1, seed = NULL) { | |
targ <- data[, .(case = sum(get(treat)), control = sum(!get(treat))), mvar] | |
targ[, cst := floor(pmin(control / ratio, case))] | |
targ[, cnt := cst * ratio] | |
targ <- targ[cst > 0] | |
setnames(targ, mvar, 'mvar') | |
l2 <- cumsum(targ$cst) | |
ids <- mapply(':', c(0, l2[-nrow(targ)]), l2-1) | |
names(ids) <- targ$mvar | |
case <- NULL | |
control <- NULL | |
set.seed(seed) | |
on.exit(set.seed(NULL)) | |
for(i in targ$mvar) { | |
case[[i]] <- data[get(treat) == T & get(mvar) == i][sample(.N, targ$cst[targ$mvar == i])] | |
case[[i]][, id := ids[[i]]] | |
control[[i]] <- data[get(treat) == F & get(mvar) == i][sample(.N, targ$cnt[targ$mvar == i])] | |
control[[i]][, id := rep(ids[[i]], each = ratio)] | |
} | |
rbindlist(c(case, control)) | |
} | |
#-------- | |
# example | |
#-------- | |
# random dataset | |
N <- 5000 | |
d <- data.table(sex = sample(c('M', 'F'), N, replace = T), | |
ageGroup = sample(1:5, N, replace = T, prob = (1:5)/15)) | |
d[, exposed := rbinom(N, 1, ageGroup/7) == 1] # treatment variable | |
d[, PID := .I] # participant ID | |
d[, age_sex := paste0(sex, '-', ageGroup)] | |
# describe full dataset: exposure group is older | |
prop.table(table(d$age_sex, d$exposed), 2) * 100 | |
# create matched dataset | |
matched_data <- smatch(d, 'exposed', 'age_sex') # gives a different matched set each time | |
matched_data <- smatch(d, 'exposed', 'age_sex', seed = 5) # gives the same matched set each time (the number 5 is arbitrary) | |
# check balance | |
dcast(matched_data, age_sex ~ exposed, value.var = 'sex', fun.aggregate = length) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment