danlewer/exact_match.R

## exact_match.R
# function requires data.table

library(data.table)

#------------------------
# exact matching function
#------------------------

# stratifies dataset and then selects random observations within strata
# data = dataset containing:
# - treatment/exposure variable 'mvar' (a string specifying variable name).
# - matching variable 'mvar' (a string specifying variable name). If you want to match on multiple variables, concatenate them first.
# other inputs are:
# - ratio of cases:controls (an integer > 0). You can also set ratio between 0 and 1, e.g. to 0.5 to get 2 cases/control, but the results aren't perfect
# - seed for fixing random selection of cases/controls (an integer; default NULL means no seed). Choice of seed is arbitrary. Seed will be reset globally at the end of the function (i.e. the function will overwrite a previous seed)
# returns data.table of matched observations, with additional variable 'id' for use in paired/grouped analyses
# the speed of the function mostly depends on the number of strata (i.e. levels of the matching variable)
# there is no problem converting the results to a standard data frame with data.frame(matched_data) if you prefer working in this way

smatch <- function (data, treat, mvar, ratio = 1, seed = NULL) {
  targ <- data[, .(case = sum(get(treat)), control = sum(!get(treat))), mvar]
  targ[, cst := floor(pmin(control / ratio, case))]
  targ[, cnt := cst * ratio]
  targ <- targ[cst > 0]
  setnames(targ, mvar, 'mvar')
  l2 <- cumsum(targ$cst)
  ids <- mapply(':', c(0, l2[-nrow(targ)]), l2-1)
  names(ids) <- targ$mvar
  case <- NULL
  control <- NULL
  set.seed(seed)
  on.exit(set.seed(NULL))
  for(i in targ$mvar) {
    case[[i]] <- data[get(treat) == T & get(mvar) == i][sample(.N, targ$cst[targ$mvar == i])]
    case[[i]][, id := ids[[i]]]
    control[[i]] <- data[get(treat) == F & get(mvar) == i][sample(.N, targ$cnt[targ$mvar == i])]
    control[[i]][, id := rep(ids[[i]], each = ratio)]
  }
  rbindlist(c(case, control))
}

#--------
# example
#--------

# random dataset

N <- 5000
d <- data.table(sex = sample(c('M', 'F'), N, replace = T),
                ageGroup = sample(1:5, N, replace = T, prob = (1:5)/15))
d[, exposed := rbinom(N, 1, ageGroup/7) == 1] # treatment variable
d[, PID := .I] # participant ID
d[, age_sex := paste0(sex, '-', ageGroup)]

# describe full dataset: exposure group is older

prop.table(table(d$age_sex, d$exposed), 2) * 100

# create matched dataset

matched_data <- smatch(d, 'exposed', 'age_sex') # gives a different matched set each time
matched_data <- smatch(d, 'exposed', 'age_sex', seed = 5) # gives the same matched set each time (the number 5 is arbitrary)

# check balance

dcast(matched_data, age_sex ~ exposed, value.var = 'sex', fun.aggregate = length)
	# function requires data.table

	library(data.table)

	#------------------------
	# exact matching function
	#------------------------

	# stratifies dataset and then selects random observations within strata
	# data = dataset containing:
	# - treatment/exposure variable 'mvar' (a string specifying variable name).
	# - matching variable 'mvar' (a string specifying variable name). If you want to match on multiple variables, concatenate them first.
	# other inputs are:
	# - ratio of cases:controls (an integer > 0). You can also set ratio between 0 and 1, e.g. to 0.5 to get 2 cases/control, but the results aren't perfect
	# - seed for fixing random selection of cases/controls (an integer; default NULL means no seed). Choice of seed is arbitrary. Seed will be reset globally at the end of the function (i.e. the function will overwrite a previous seed)
	# returns data.table of matched observations, with additional variable 'id' for use in paired/grouped analyses
	# the speed of the function mostly depends on the number of strata (i.e. levels of the matching variable)
	# there is no problem converting the results to a standard data frame with data.frame(matched_data) if you prefer working in this way

	smatch <- function (data, treat, mvar, ratio = 1, seed = NULL) {
	targ <- data[, .(case = sum(get(treat)), control = sum(!get(treat))), mvar]
	targ[, cst := floor(pmin(control / ratio, case))]
	targ[, cnt := cst * ratio]
	targ <- targ[cst > 0]
	setnames(targ, mvar, 'mvar')
	l2 <- cumsum(targ$cst)
	ids <- mapply(':', c(0, l2[-nrow(targ)]), l2-1)
	names(ids) <- targ$mvar
	case <- NULL
	control <- NULL
	set.seed(seed)
	on.exit(set.seed(NULL))
	for(i in targ$mvar) {
	case[[i]] <- data[get(treat) == T & get(mvar) == i][sample(.N, targ$cst[targ$mvar == i])]
	case[[i]][, id := ids[[i]]]
	control[[i]] <- data[get(treat) == F & get(mvar) == i][sample(.N, targ$cnt[targ$mvar == i])]
	control[[i]][, id := rep(ids[[i]], each = ratio)]
	}
	rbindlist(c(case, control))
	}

	#--------
	# example
	#--------

	# random dataset

	N <- 5000
	d <- data.table(sex = sample(c('M', 'F'), N, replace = T),
	ageGroup = sample(1:5, N, replace = T, prob = (1:5)/15))
	d[, exposed := rbinom(N, 1, ageGroup/7) == 1] # treatment variable
	d[, PID := .I] # participant ID
	d[, age_sex := paste0(sex, '-', ageGroup)]

	# describe full dataset: exposure group is older

	prop.table(table(d$age_sex, d$exposed), 2) * 100

	# create matched dataset

	matched_data <- smatch(d, 'exposed', 'age_sex') # gives a different matched set each time
	matched_data <- smatch(d, 'exposed', 'age_sex', seed = 5) # gives the same matched set each time (the number 5 is arbitrary)

	# check balance

	dcast(matched_data, age_sex ~ exposed, value.var = 'sex', fun.aggregate = length)