Skip to content

Instantly share code, notes, and snippets.

@danlewer
Last active April 19, 2022 23:56
Show Gist options
  • Save danlewer/1c4fd37c8dd4ea11483236644890de63 to your computer and use it in GitHub Desktop.
Save danlewer/1c4fd37c8dd4ea11483236644890de63 to your computer and use it in GitHub Desktop.
# function requires data.table
library(data.table)
#------------------------
# exact matching function
#------------------------
# stratifies dataset and then selects random observations within strata
# data = dataset containing:
# - treatment/exposure variable 'mvar' (a string specifying variable name).
# - matching variable 'mvar' (a string specifying variable name). If you want to match on multiple variables, concatenate them first.
# other inputs are:
# - ratio of cases:controls (an integer > 0). You can also set ratio between 0 and 1, e.g. to 0.5 to get 2 cases/control, but the results aren't perfect
# - seed for fixing random selection of cases/controls (an integer; default NULL means no seed). Choice of seed is arbitrary. Seed will be reset globally at the end of the function (i.e. the function will overwrite a previous seed)
# returns data.table of matched observations, with additional variable 'id' for use in paired/grouped analyses
# the speed of the function mostly depends on the number of strata (i.e. levels of the matching variable)
# there is no problem converting the results to a standard data frame with data.frame(matched_data) if you prefer working in this way
smatch <- function (data, treat, mvar, ratio = 1, seed = NULL) {
targ <- data[, .(case = sum(get(treat)), control = sum(!get(treat))), mvar]
targ[, cst := floor(pmin(control / ratio, case))]
targ[, cnt := cst * ratio]
targ <- targ[cst > 0]
setnames(targ, mvar, 'mvar')
l2 <- cumsum(targ$cst)
ids <- mapply(':', c(0, l2[-nrow(targ)]), l2-1)
names(ids) <- targ$mvar
case <- NULL
control <- NULL
set.seed(seed)
on.exit(set.seed(NULL))
for(i in targ$mvar) {
case[[i]] <- data[get(treat) == T & get(mvar) == i][sample(.N, targ$cst[targ$mvar == i])]
case[[i]][, id := ids[[i]]]
control[[i]] <- data[get(treat) == F & get(mvar) == i][sample(.N, targ$cnt[targ$mvar == i])]
control[[i]][, id := rep(ids[[i]], each = ratio)]
}
rbindlist(c(case, control))
}
#--------
# example
#--------
# random dataset
N <- 5000
d <- data.table(sex = sample(c('M', 'F'), N, replace = T),
ageGroup = sample(1:5, N, replace = T, prob = (1:5)/15))
d[, exposed := rbinom(N, 1, ageGroup/7) == 1] # treatment variable
d[, PID := .I] # participant ID
d[, age_sex := paste0(sex, '-', ageGroup)]
# describe full dataset: exposure group is older
prop.table(table(d$age_sex, d$exposed), 2) * 100
# create matched dataset
matched_data <- smatch(d, 'exposed', 'age_sex') # gives a different matched set each time
matched_data <- smatch(d, 'exposed', 'age_sex', seed = 5) # gives the same matched set each time (the number 5 is arbitrary)
# check balance
dcast(matched_data, age_sex ~ exposed, value.var = 'sex', fun.aggregate = length)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment