Created
February 13, 2017 16:18
-
-
Save sternj/97962180ece8758e7afe991c6444bd50 to your computer and use it in GitHub Desktop.
Jonathan Stern part 1 AI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Jane Smith project analysis. | |
rm(list = ls()) | |
# Read in data ---- | |
wd <- "~/" | |
rand_data <- read.csv(paste0(wd, "data/smith_rand.csv"), stringsAsFactors = F) | |
covar_data <- read.csv(paste0(wd, "data/smith_covariates.csv"), stringsAsFactors = F) | |
outcome_data <- read.csv(paste0(wd, "data/smith_outcomes.csv"), stringsAsFactors = F) | |
# Combine randomization, covariate, and survey outcome data | |
df <- merge(rand_data, covar_data, by = "ai_id") | |
df <- merge(df, outcome_data, by = "ai_id") | |
# Get rid of duplicated ai_ids | |
df <- df[!duplicated(df$ai_id),] | |
# Clean Data ----- | |
# Take covariate data from covariate file | |
# error in df$gender assignment-- df$gender.x has frequent NA vals | |
# changed to df$gender.y, which has no NA vals | |
# (as tested with length(df$gender.y[is.na(df$gender.y)])) | |
df$gender <- df$gender.y | |
df$race <- df$race.x | |
df$age <- df$age.x | |
# Cut down number of race categories | |
df$race_clean <- NA | |
df$race_clean[df$race == "caucasian"] <- "0White" | |
df$race_clean[df$race == "black"] <- "1Black" | |
df$race_clean[df$race == "hispanic"] <- "2Hispanic" | |
df$race_clean[df$race == "asian" | | |
df$race == "middleEastern" | | |
df$race == "unknown"] <- "3Other" | |
# Bin age | |
df$age_bin <- NA | |
df$age_bin[df$age >= 18 & df$age < 34] <- "18-34" | |
df$age_bin[df$age >= 35 & df$age < 44] <- "35-44" | |
df$age_bin[df$age >= 45 & df$age < 64] <- "45-64" | |
df$age_bin[df$age >= 65 & df$age != 999 ] <- "65+" | |
df$age_bin[df$age == 999] <- "Unknown" | |
# Added additional category for 999-- placeholder for unknown age | |
# Something for me to note-- I spent a good bit of code making some of these categories factors | |
# when I was dealing with the analysis. If I could have remembered how, I would have imported some | |
# of the Strings as factors in the first place. I realize now, right before submission, | |
# that I could have made those columns factors. While my code works, that is one of the major improvements | |
# that I would make to it. | |
# Check for balance across assignment ----- | |
# So I know we want to check that there is no statistically significant | |
# relationship between treatment and any of the covariates. I'm not sure | |
# how to do because there are 3 treatment categories. | |
tapply(df$gender.y, df$message_treat, table) | |
tapply(df$race_clean, df$message_treat, table) | |
tapply(df$age_bin, df$message_treat, table) | |
#Calculations can be run on these tables | |
# Estimate voter persuasion treatment effects ----- | |
# Code outcome variable | |
df$democrat <- NA | |
df$democrat[df$outcome == "Democrat Jane Smith"] <- TRUE | |
df$democrat[df$outcome == "Republican Paul Jones"] <- FALSE | |
persuade_no_covar <- glm(democrat ~ message_treat, data = df, family = "binomial") | |
summary(persuade_no_covar) | |
# Control for race, age bins, gender, partisanship, and whether they turned | |
# out in 2014. | |
persuade_covar <- glm(democrat ~ message_treat + race_clean + age + gender + | |
partisanscore + turnout2014, | |
data = df, family = "binomial") | |
summary(persuade_covar) | |
# Looks like ProHealth was effective and ProTeach wasn't. | |
# Estimate voter mobilization treatment effects ----- | |
turnout_no_covar <- glm(turnout2014 ~ message_treat, data = df, | |
family = "binomial") | |
summary(turnout_no_covar) | |
turnout_covar <- glm(turnout2014 ~ message_treat + race_clean + age + gender + | |
partisanscore, | |
data = df, family = "binomial") | |
summary(turnout_covar) | |
# Looks like both ProHealth and ProTeach increased turnout, but ProTeach by more. | |
# additional idea from me-- see if there is a relationship between any covar | |
# and a NA result. | |
# tested idea using following code | |
# tapply(tb$age_bin, tb$message_treat, table) [etc, using different covars] | |
# No significant difference witnessed. I'm not completely sure what NA | |
# means in this context, but it seems trivial, and even were it to be non-trivial | |
# it seems statistically insignificant | |
# Graphs ------ | |
# I didn't have time to create the graphs for the presentation. Could you do it? | |
# I know it needs to have the treatment effect in all groups and 90% error bars. | |
# Created graphs on each of the two measures with & without covars | |
turnout_no_cov_tbl <- table(df$message_treat, df$turnout2014) | |
barplot(turnout_no_cov_tbl,beside = TRUE, legend = rownames(turnout_no_cov_tbl), xlab = "turnout", main = "Turnout Without Covars") | |
isDem_no_cov_tbl <- table(df$message_treat, df$democrat) | |
barplot(isDem_no_cov_tbl,beside = TRUE, legend = rownames(isDem_no_cov_tbl), xlab = "Is a democrat", main = "Voting Pattern Without Covars") | |
# Graphed mean values of non-NA turnout & outcome relative to each cofactor | |
# This hinges on the concept that booleans are treated in mean() as '1' or '0'. | |
# Recalling that gender is now gender.y, given NA values in gender.x | |
genderTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$gender[!is.na(df$turnout2014)]), mean) | |
ageTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)],df$age_bin[!is.na(df$turnout2014)]),mean) | |
raceTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$race_clean[!is.na(df$turnout2014)]),mean) | |
# Gender turnout Barplot | |
barplot(genderTurnoutSuccess, beside = TRUE, legend = rownames(genderTurnoutSuccess), xlab = "Gender", main = "Gender Turnout Success", ylab = "percentage") | |
# Age turnout barplot | |
barplot(ageTurnoutSuccess, beside = TRUE, legend = rownames(ageTurnoutSuccess), xlab = "Age", main = "Age Turnout Success", ylab = "percentage") | |
#Race turnout barplot | |
barplot(raceTurnoutSuccess, beside = TRUE, legend = rownames(raceTurnoutSuccess), xlab = "race", main = "Race Turnout Success", ylab = "percentage") | |
# Known cosmetic issue-- final bars covered by legend. I suspect that this is an easy fix, but not one that I really had time to make | |
# Outcome graphs-- same methodology | |
genderOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$gender[!is.na(df$democrat)]), mean) | |
ageOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$age_bin[!is.na(df$democrat)]), mean) | |
raceOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$race_clean[!is.na(df$democrat)]), mean) | |
barplot(genderOutcome, beside = TRUE, legend = rownames(genderOutcome), xlab = "Gender", ylab = "Percent Democrat Vote", main = "Voting Patterns by Gender") | |
barplot(ageOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Age Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Age") | |
# Appears to be a large spike in voting democrat in 18-34 demographic in pro-health. Not sure if a fluke in my graphing or something significant | |
barplot(raceOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Race Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Race") | |
# Data listed as present in environment not present in bar graphs. May be a factor of the behavior of the mean() function relative to NA values, though I | |
# subsetted specifically to avoid this, and used the clean data. Would review given time. | |
# I somewhat wish there was some more documentation on this data-- while the names are incredibly intuitive and immensely helpful, | |
# there seemed to be some edge/ placeholder values. | |
# Perhaps part of the point of this exercise was to see if we could recognize those values-- if so, I did my best to account for them | |
# in calculation. |
Author
sternj
commented
Feb 13, 2017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment