ngreifer/subclass_split.R

## subclass_split.R
# Implements the subclass splitting algorithm described by Imbens & Rubin (2015, Sec 13.5)
# Arguments:
# - ps: a vector of (linearized) propensity scores
# - z: a vector of treatment status (2 values, doesn't have to be 0/1)
# - tmax: the threshold of the t-statistic used to determine whether imbalance remains and
#         s plit should be formed. High values make splits less likely.
# - minn: the minimum number of units of each treatment group allowed in each subclass
# - focal: the treatment group where the subclass-wise median ps is computed; leave
#          NULL to use the full sample
#
# Output: a factor variable containing subclass membership

subclass_split <- function(ps, z, tmax = 1, minn = 3, focal = NULL) {

  k <- 1
  s <- rep(k, length(z))
  zz <- z[1]

  improved <- TRUE

  while (improved) {

    improved <- FALSE

    for (i in 1:k) {
      split <- TRUE
      m <- if (is.null(focal)) median(ps[s==i]) else median(ps[s==i & z == focal])

      #Check subclass size
      if (!all(c(sum(s == i & z == zz & ps < m),
                 sum(s == i & z == zz & ps >= m),
                 sum(s == i & z != zz & ps < m),
                 sum(s == i & z != zz & ps >= m)) >= minn))
        split <- FALSE

      #Check independence
      if (split) {
        l0 = mean(ps[z == zz & s == i])
        l1 = mean(ps[z != zz & s == i])
        v = (1/(sum(s == i) - 2))*(sum((ps[z == zz & s == i] - l0)^2) + sum((ps[z!= zz & s == i] - l1)^2))
        t = abs(l1 - l0)/sqrt(v*(1/sum(z == zz & s == i) + 1/sum(z != zz & s == i)))
        if (t <= tmax) split <- FALSE
      }

      if (split) {
        s[s==i & ps >= m] <- k
        k <- k + 1
        improved <- TRUE
      }
    }
  }

  s <- factor(s)
  levels(s) <- 1:(k - 1)

  return(s)
}
	# Implements the subclass splitting algorithm described by Imbens & Rubin (2015, Sec 13.5)
	# Arguments:
	# - ps: a vector of (linearized) propensity scores
	# - z: a vector of treatment status (2 values, doesn't have to be 0/1)
	# - tmax: the threshold of the t-statistic used to determine whether imbalance remains and
	# s plit should be formed. High values make splits less likely.
	# - minn: the minimum number of units of each treatment group allowed in each subclass
	# - focal: the treatment group where the subclass-wise median ps is computed; leave
	# NULL to use the full sample
	#
	# Output: a factor variable containing subclass membership

	subclass_split <- function(ps, z, tmax = 1, minn = 3, focal = NULL) {

	k <- 1
	s <- rep(k, length(z))
	zz <- z[1]

	improved <- TRUE

	while (improved) {

	improved <- FALSE

	for (i in 1:k) {
	split <- TRUE
	m <- if (is.null(focal)) median(ps[s==i]) else median(ps[s==i & z == focal])

	#Check subclass size
	if (!all(c(sum(s == i & z == zz & ps < m),
	sum(s == i & z == zz & ps >= m),
	sum(s == i & z != zz & ps < m),
	sum(s == i & z != zz & ps >= m)) >= minn))
	split <- FALSE

	#Check independence
	if (split) {
	l0 = mean(ps[z == zz & s == i])
	l1 = mean(ps[z != zz & s == i])
	v = (1/(sum(s == i) - 2))*(sum((ps[z == zz & s == i] - l0)^2) + sum((ps[z!= zz & s == i] - l1)^2))
	t = abs(l1 - l0)/sqrt(v*(1/sum(z == zz & s == i) + 1/sum(z != zz & s == i)))
	if (t <= tmax) split <- FALSE
	}

	if (split) {
	s[s==i & ps >= m] <- k
	k <- k + 1
	improved <- TRUE
	}
	}
	}

	s <- factor(s)
	levels(s) <- 1:(k - 1)

	return(s)
	}