Skip to content

Instantly share code, notes, and snippets.

@sternj
Created February 13, 2017 16:18
Show Gist options
  • Save sternj/97962180ece8758e7afe991c6444bd50 to your computer and use it in GitHub Desktop.
Save sternj/97962180ece8758e7afe991c6444bd50 to your computer and use it in GitHub Desktop.
Jonathan Stern part 1 AI
# Jane Smith project analysis.
rm(list = ls())
# Read in data ----
wd <- "~/"
rand_data <- read.csv(paste0(wd, "data/smith_rand.csv"), stringsAsFactors = F)
covar_data <- read.csv(paste0(wd, "data/smith_covariates.csv"), stringsAsFactors = F)
outcome_data <- read.csv(paste0(wd, "data/smith_outcomes.csv"), stringsAsFactors = F)
# Combine randomization, covariate, and survey outcome data
df <- merge(rand_data, covar_data, by = "ai_id")
df <- merge(df, outcome_data, by = "ai_id")
# Get rid of duplicated ai_ids
df <- df[!duplicated(df$ai_id),]
# Clean Data -----
# Take covariate data from covariate file
# error in df$gender assignment-- df$gender.x has frequent NA vals
# changed to df$gender.y, which has no NA vals
# (as tested with length(df$gender.y[is.na(df$gender.y)]))
df$gender <- df$gender.y
df$race <- df$race.x
df$age <- df$age.x
# Cut down number of race categories
df$race_clean <- NA
df$race_clean[df$race == "caucasian"] <- "0White"
df$race_clean[df$race == "black"] <- "1Black"
df$race_clean[df$race == "hispanic"] <- "2Hispanic"
df$race_clean[df$race == "asian" |
df$race == "middleEastern" |
df$race == "unknown"] <- "3Other"
# Bin age
df$age_bin <- NA
df$age_bin[df$age >= 18 & df$age < 34] <- "18-34"
df$age_bin[df$age >= 35 & df$age < 44] <- "35-44"
df$age_bin[df$age >= 45 & df$age < 64] <- "45-64"
df$age_bin[df$age >= 65 & df$age != 999 ] <- "65+"
df$age_bin[df$age == 999] <- "Unknown"
# Added additional category for 999-- placeholder for unknown age
# Something for me to note-- I spent a good bit of code making some of these categories factors
# when I was dealing with the analysis. If I could have remembered how, I would have imported some
# of the Strings as factors in the first place. I realize now, right before submission,
# that I could have made those columns factors. While my code works, that is one of the major improvements
# that I would make to it.
# Check for balance across assignment -----
# So I know we want to check that there is no statistically significant
# relationship between treatment and any of the covariates. I'm not sure
# how to do because there are 3 treatment categories.
tapply(df$gender.y, df$message_treat, table)
tapply(df$race_clean, df$message_treat, table)
tapply(df$age_bin, df$message_treat, table)
#Calculations can be run on these tables
# Estimate voter persuasion treatment effects -----
# Code outcome variable
df$democrat <- NA
df$democrat[df$outcome == "Democrat Jane Smith"] <- TRUE
df$democrat[df$outcome == "Republican Paul Jones"] <- FALSE
persuade_no_covar <- glm(democrat ~ message_treat, data = df, family = "binomial")
summary(persuade_no_covar)
# Control for race, age bins, gender, partisanship, and whether they turned
# out in 2014.
persuade_covar <- glm(democrat ~ message_treat + race_clean + age + gender +
partisanscore + turnout2014,
data = df, family = "binomial")
summary(persuade_covar)
# Looks like ProHealth was effective and ProTeach wasn't.
# Estimate voter mobilization treatment effects -----
turnout_no_covar <- glm(turnout2014 ~ message_treat, data = df,
family = "binomial")
summary(turnout_no_covar)
turnout_covar <- glm(turnout2014 ~ message_treat + race_clean + age + gender +
partisanscore,
data = df, family = "binomial")
summary(turnout_covar)
# Looks like both ProHealth and ProTeach increased turnout, but ProTeach by more.
# additional idea from me-- see if there is a relationship between any covar
# and a NA result.
# tested idea using following code
# tapply(tb$age_bin, tb$message_treat, table) [etc, using different covars]
# No significant difference witnessed. I'm not completely sure what NA
# means in this context, but it seems trivial, and even were it to be non-trivial
# it seems statistically insignificant
# Graphs ------
# I didn't have time to create the graphs for the presentation. Could you do it?
# I know it needs to have the treatment effect in all groups and 90% error bars.
# Created graphs on each of the two measures with & without covars
turnout_no_cov_tbl <- table(df$message_treat, df$turnout2014)
barplot(turnout_no_cov_tbl,beside = TRUE, legend = rownames(turnout_no_cov_tbl), xlab = "turnout", main = "Turnout Without Covars")
isDem_no_cov_tbl <- table(df$message_treat, df$democrat)
barplot(isDem_no_cov_tbl,beside = TRUE, legend = rownames(isDem_no_cov_tbl), xlab = "Is a democrat", main = "Voting Pattern Without Covars")
# Graphed mean values of non-NA turnout & outcome relative to each cofactor
# This hinges on the concept that booleans are treated in mean() as '1' or '0'.
# Recalling that gender is now gender.y, given NA values in gender.x
genderTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$gender[!is.na(df$turnout2014)]), mean)
ageTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)],df$age_bin[!is.na(df$turnout2014)]),mean)
raceTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$race_clean[!is.na(df$turnout2014)]),mean)
# Gender turnout Barplot
barplot(genderTurnoutSuccess, beside = TRUE, legend = rownames(genderTurnoutSuccess), xlab = "Gender", main = "Gender Turnout Success", ylab = "percentage")
# Age turnout barplot
barplot(ageTurnoutSuccess, beside = TRUE, legend = rownames(ageTurnoutSuccess), xlab = "Age", main = "Age Turnout Success", ylab = "percentage")
#Race turnout barplot
barplot(raceTurnoutSuccess, beside = TRUE, legend = rownames(raceTurnoutSuccess), xlab = "race", main = "Race Turnout Success", ylab = "percentage")
# Known cosmetic issue-- final bars covered by legend. I suspect that this is an easy fix, but not one that I really had time to make
# Outcome graphs-- same methodology
genderOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$gender[!is.na(df$democrat)]), mean)
ageOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$age_bin[!is.na(df$democrat)]), mean)
raceOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$race_clean[!is.na(df$democrat)]), mean)
barplot(genderOutcome, beside = TRUE, legend = rownames(genderOutcome), xlab = "Gender", ylab = "Percent Democrat Vote", main = "Voting Patterns by Gender")
barplot(ageOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Age Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Age")
# Appears to be a large spike in voting democrat in 18-34 demographic in pro-health. Not sure if a fluke in my graphing or something significant
barplot(raceOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Race Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Race")
# Data listed as present in environment not present in bar graphs. May be a factor of the behavior of the mean() function relative to NA values, though I
# subsetted specifically to avoid this, and used the clean data. Would review given time.
# I somewhat wish there was some more documentation on this data-- while the names are incredibly intuitive and immensely helpful,
# there seemed to be some edge/ placeholder values.
# Perhaps part of the point of this exercise was to see if we could recognize those values-- if so, I did my best to account for them
# in calculation.
@sternj
Copy link
Author

sternj commented Feb 13, 2017

image
image
image
image
image
image
image
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment