soodoku/prop_weights.R

## prop_weights.R
"
Weighting by Propensity Scores
Last Edited: 5/31/2015

Task Outline:
1. Two datasets:
	dataset 1: large pop. representative sample
	dataset 2: convenient sample
2. Create weights for dataset 2 so that its marginals are close to dataset 1 on some vars.

How to:
1. Standardize coding of variables you want to match on across datasets
	- You can create a separate dummy variable for missing values for each variable in each dataset
2. Follow steps below
"

# Get some data
library(anesrake); data(anes04)
library(Zeling); data(voteincome)

"
Picking common cols + making data similar + add a column identifying which.data
"

# dat1
data1 <- anes04[,c("female", "age", "educcats")]
names(data1)[3] <- "education"
data1$education[data1$education==5] <- 4
data1 <- data.frame(pop=1, data1)

# dat2
data2 <- voteincome[,c("female", "age", "education")]
data2 <- data.frame(pop=0, data2)

# rbind data
data <- rbind(data1, data2)

# Fit Logistic
# Model this well
library(splines)
est  <- glm(I(pop==0) ~ female + ns(age,3)*as.factor(education), data=data, family=binomial(link="logit"))

# Get prop. scores
data$prop.scores <- est$linear.predictors

# summary(data$prop.scores)

# Break prop.scores into quintiles
# Use control group (pop.) for this
pop.quantiles <- quantile(data$prop.scores[data$pop == 1], probs = seq(0, 1, .2))
data$quantile <- cut(data$prop.scores, breaks=pop.quantiles, include.lowest=TRUE)
tbl <- prop.table(xtabs( ~ quantile + pop, data=data), 2)
mult <- tbl[,1] / tbl[,2]

# Create weights
data$new.weight <- ifelse(data$pop == 1,  mult[as.integer(data$quantile)], 1)

# Checking how the new weights work
# Age
summary(lm(age ~ pop, weight=new.weight, data=data))$coef[2,1]
# -0.8366
summary(lm(age ~ pop, data=data))$coef[2,1]
#  -1.9891

# Sex
summary(lm(female ~ pop, weight=new.weight, data=data))$coef[2,1]
# 0.009970668
summary(lm(female ~ pop, data=data))$coef[2,1]
#  -0.02633

# Education
summary(lm(education ~ pop, weight=new.weight, data=data))$coef[2,1]
# [1] 0.05121831
summary(lm(education ~ pop, data=data))$coef[2,1]
# [1] 0.2702838
	"
	Weighting by Propensity Scores
	Last Edited: 5/31/2015

	Task Outline:
	1. Two datasets:
	dataset 1: large pop. representative sample
	dataset 2: convenient sample
	2. Create weights for dataset 2 so that its marginals are close to dataset 1 on some vars.

	How to:
	1. Standardize coding of variables you want to match on across datasets
	- You can create a separate dummy variable for missing values for each variable in each dataset
	2. Follow steps below
	"

	# Get some data
	library(anesrake); data(anes04)
	library(Zeling); data(voteincome)

	"
	Picking common cols + making data similar + add a column identifying which.data
	"

	# dat1
	data1 <- anes04[,c("female", "age", "educcats")]
	names(data1)[3] <- "education"
	data1$education[data1$education==5] <- 4
	data1 <- data.frame(pop=1, data1)

	# dat2
	data2 <- voteincome[,c("female", "age", "education")]
	data2 <- data.frame(pop=0, data2)

	# rbind data
	data <- rbind(data1, data2)

	# Fit Logistic
	# Model this well
	library(splines)
	est <- glm(I(pop==0) ~ female + ns(age,3)*as.factor(education), data=data, family=binomial(link="logit"))

	# Get prop. scores
	data$prop.scores <- est$linear.predictors

	# summary(data$prop.scores)

	# Break prop.scores into quintiles
	# Use control group (pop.) for this
	pop.quantiles <- quantile(data$prop.scores[data$pop == 1], probs = seq(0, 1, .2))
	data$quantile <- cut(data$prop.scores, breaks=pop.quantiles, include.lowest=TRUE)
	tbl <- prop.table(xtabs( ~ quantile + pop, data=data), 2)
	mult <- tbl[,1] / tbl[,2]

	# Create weights
	data$new.weight <- ifelse(data$pop == 1, mult[as.integer(data$quantile)], 1)

	# Checking how the new weights work
	# Age
	summary(lm(age ~ pop, weight=new.weight, data=data))$coef[2,1]
	# -0.8366
	summary(lm(age ~ pop, data=data))$coef[2,1]
	# -1.9891

	# Sex
	summary(lm(female ~ pop, weight=new.weight, data=data))$coef[2,1]
	# 0.009970668
	summary(lm(female ~ pop, data=data))$coef[2,1]
	# -0.02633

	# Education
	summary(lm(education ~ pop, weight=new.weight, data=data))$coef[2,1]
	# [1] 0.05121831
	summary(lm(education ~ pop, data=data))$coef[2,1]
	# [1] 0.2702838