# jamesdunham/smoother.R

## smoother.R
smooth_years <- function(d, varnames = NULL, span = 0.25, step = 0.25) {
  # Smooth variables over years using a loess interpolation
  #
  # param d: a data.frame holding a numeric 'year' column and the
  #   columns to smooth.
  # param varnames: the names of the variables to smooth over;
  #   if NULL (or empty), the columns whose names match
  #   "median|ci_min|ci_max" are used, and exactly three such
  #   columns must be present.
  # param span: smoothing span handed to loess(). The default of
  #   0.25 worked well in the original data; adjust as needed.
  # param step: spacing of the prediction grid, in years. The
  #   default of 0.25 adds three periods between each pair of
  #   years, e.g. 2001.25, 2001.5, 2001.75.
  #
  # Returns a data.frame with one row per `step` from the smallest
  # to the largest year in d. It contains 'year' and, for each
  # smoothed variable, the values predicted by the loess fit; the
  # original observations are not carried over.
  #
  # Stops with an error if d has no 'year' column, if a requested
  # variable is not a column of d, or if the name guess does not
  # find exactly three columns.

  stopifnot(is.data.frame(d), "year" %in% names(d))

  if (length(varnames) == 0) {
    # no names supplied, so expect names matching this pattern
    varnames <- grep("median|ci_min|ci_max", names(d), value = TRUE)
    stopifnot(length(varnames) == 3)
  }

  unknown <- setdiff(varnames, names(d))
  if (length(unknown) > 0) {
    stop("columns not found in d: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }

  # missing years are skipped when locating the ends of the grid
  year_range <- range(d[["year"]], na.rm = TRUE)
  grid <- data.frame(year = seq(year_range[1], year_range[2], by = step))

  smoothed <- grid
  for (varname in varnames) {
    fit <- loess(reformulate("year", response = varname), data = d,
                 span = span)
    # always predict on the bare year grid, not on the growing result
    smoothed[[varname]] <- predict(fit, newdata = grid)
  }
  smoothed
}

# If the input data.frame were something like the below, we might
# want to fit and predict separately for each state-pid.
#    state_abb pid3 year median_theta_bar      ci_min     ci_max
# 1:        ID    D 1947       -0.1407980  -1.5669650  0.9186605
# 2:        ID    D 1948        0.1992394  -0.6396129  1.0706514

# one way to accomplish this is with dplyr's do()
library(dplyr)
library(data.table)

# Fit and predict separately within each state/pid3 group, then
# stack the per-group results into one table.
# NOTE(review): the sample rows above show a `state_abb` column while
# this groups by `state` -- confirm the column name in `medians`.
local({
  by_group <- group_by(medians, state, pid3)
  # smooth_years() runs once per group; its result is stored under `fit`
  fitted <- do(by_group, fit = smooth_years(.))
  # build a separate data.frame from each row of the per-group results
  per_group <- apply(fitted, 1, data.frame)
  rbindlist(per_group)
})
	smooth_years <- function(d, varnames = NULL, span = 0.25, step = 0.25) {
	  # Smooth variables over years using a loess interpolation
	  #
	  # param d: a data.frame holding a numeric 'year' column and the
	  #   columns to smooth.
	  # param varnames: the names of the variables to smooth over;
	  #   if NULL (or empty), the columns whose names match
	  #   "median|ci_min|ci_max" are used, and exactly three such
	  #   columns must be present.
	  # param span: smoothing span handed to loess(). The default of
	  #   0.25 worked well in the original data; adjust as needed.
	  # param step: spacing of the prediction grid, in years. The
	  #   default of 0.25 adds three periods between each pair of
	  #   years, e.g. 2001.25, 2001.5, 2001.75.
	  #
	  # Returns a data.frame with one row per `step` from the smallest
	  # to the largest year in d. It contains 'year' and, for each
	  # smoothed variable, the values predicted by the loess fit; the
	  # original observations are not carried over.
	  #
	  # Stops with an error if d has no 'year' column, if a requested
	  # variable is not a column of d, or if the name guess does not
	  # find exactly three columns.

	  stopifnot(is.data.frame(d), "year" %in% names(d))

	  if (length(varnames) == 0) {
	    # no names supplied, so expect names matching this pattern
	    # (plain "|" alternation; "\|" is not a valid R string escape)
	    varnames <- grep("median|ci_min|ci_max", names(d), value = TRUE)
	    stopifnot(length(varnames) == 3)
	  }

	  unknown <- setdiff(varnames, names(d))
	  if (length(unknown) > 0) {
	    stop("columns not found in d: ", paste(unknown, collapse = ", "),
	         call. = FALSE)
	  }

	  # missing years are skipped when locating the ends of the grid
	  year_range <- range(d[["year"]], na.rm = TRUE)
	  grid <- data.frame(year = seq(year_range[1], year_range[2], by = step))

	  smoothed <- grid
	  for (varname in varnames) {
	    fit <- loess(reformulate("year", response = varname), data = d,
	                 span = span)
	    # always predict on the bare year grid, not on the growing result
	    smoothed[[varname]] <- predict(fit, newdata = grid)
	  }
	  smoothed
	}

	# If the input data.frame were something like the below, we might
	# want to fit and predict separately for each state-pid.
	# state_abb pid3 year median_theta_bar ci_min ci_max
	# 1: ID D 1947 -0.1407980 -1.5669650 0.9186605
	# 2: ID D 1948 0.1992394 -0.6396129 1.0706514

	# one way to accomplish this is with dplyr's do()
	library(dplyr)
	library(data.table)

	# Fit and predict separately within each state/pid3 group, then
	# stack the per-group results into one table.
	# NOTE(review): the sample rows above show a `state_abb` column while
	# this groups by `state` -- confirm the column name in `medians`.
	local({
	  by_group <- group_by(medians, state, pid3)
	  # smooth_years() runs once per group; its result is stored under `fit`
	  fitted <- do(by_group, fit = smooth_years(.))
	  # build a separate data.frame from each row of the per-group results
	  per_group <- apply(fitted, 1, data.frame)
	  rbindlist(per_group)
	})