sternj/Stern_Jonathan_Part1.R

## Stern_Jonathan_Part1.R
# Jane Smith project analysis.
rm(list = ls())

# Read in data ----
wd <- "~/"
rand_data <- read.csv(paste0(wd, "data/smith_rand.csv"), stringsAsFactors = F)
covar_data <- read.csv(paste0(wd, "data/smith_covariates.csv"), stringsAsFactors = F)
outcome_data <- read.csv(paste0(wd, "data/smith_outcomes.csv"), stringsAsFactors = F)

# Combine randomization, covariate, and survey outcome data
df <- merge(rand_data, covar_data, by = "ai_id")
df <- merge(df, outcome_data, by = "ai_id")

# Get rid of duplicated ai_ids
df <- df[!duplicated(df$ai_id),]

# Clean Data -----
# Take covariate data from covariate file
# error in df$gender assignment-- df$gender.x has frequent NA vals
# changed to df$gender.y, which has no NA vals
# (as tested with length(df$gender.y[is.na(df$gender.y)]))
df$gender <- df$gender.y
df$race <- df$race.x
df$age <- df$age.x

# Cut down number of race categories
df$race_clean <- NA
df$race_clean[df$race == "caucasian"] <- "0White"
df$race_clean[df$race == "black"] <- "1Black"
df$race_clean[df$race == "hispanic"] <- "2Hispanic"
df$race_clean[df$race == "asian" |
                df$race == "middleEastern" |
                df$race == "unknown"] <- "3Other"

# Bin age
df$age_bin <- NA
df$age_bin[df$age >= 18 & df$age < 34] <- "18-34"
df$age_bin[df$age >= 35 & df$age < 44] <- "35-44"
df$age_bin[df$age >= 45 & df$age < 64] <- "45-64"
df$age_bin[df$age >= 65 & df$age != 999 ] <- "65+"
df$age_bin[df$age == 999] <- "Unknown"
# Added additional category for 999-- placeholder for unknown age

# Something for me to note-- I spent a good bit of code making some of these categories factors
# when I was dealing with the analysis. If I could have remembered how, I would have imported some
# of the Strings as factors in the first place. I realize now, right before submission,
# that I could have made those columns factors. While my code works, that is one of the major improvements
# that I would make to it.

# Check for balance across assignment -----
# So I know we want to check that there is no statistically significant
# relationship between treatment and any of the covariates. I'm not sure
# how to do because there are 3 treatment categories.
tapply(df$gender.y, df$message_treat, table)
tapply(df$race_clean, df$message_treat, table)
tapply(df$age_bin, df$message_treat, table)

#Calculations can be run on these tables

# Estimate voter persuasion treatment effects -----
# Code outcome variable
df$democrat <- NA
df$democrat[df$outcome == "Democrat Jane Smith"] <- TRUE
df$democrat[df$outcome == "Republican Paul Jones"] <- FALSE

persuade_no_covar <- glm(democrat ~ message_treat, data = df, family = "binomial")
summary(persuade_no_covar)

# Control for race, age bins, gender, partisanship, and whether they turned
# out in 2014.
persuade_covar <- glm(democrat ~ message_treat + race_clean + age + gender +
                        partisanscore + turnout2014,
                      data = df, family = "binomial")
summary(persuade_covar)

# Looks like ProHealth was effective and ProTeach wasn't.

# Estimate voter mobilization treatment effects -----

turnout_no_covar <- glm(turnout2014 ~ message_treat, data = df,
                        family = "binomial")
summary(turnout_no_covar)

turnout_covar <- glm(turnout2014 ~ message_treat + race_clean + age + gender +
                       partisanscore,
                     data = df, family = "binomial")
summary(turnout_covar)

# Looks like both ProHealth and ProTeach increased turnout, but ProTeach by more.

# additional idea from me-- see if there is a relationship between any covar
# and a NA result.

# tested idea using following code
# tapply(tb$age_bin, tb$message_treat, table) [etc, using different covars]
# No significant difference witnessed. I'm not completely sure what NA
# means in this context, but it seems trivial, and even were it to be non-trivial
# it seems statistically insignificant

# Graphs ------
# I didn't have time to create the graphs for the presentation. Could you do it?
# I know it needs to have the treatment effect in all groups and 90% error bars.

# Created graphs on each of the two measures with & without covars
turnout_no_cov_tbl <- table(df$message_treat, df$turnout2014)
barplot(turnout_no_cov_tbl,beside = TRUE, legend = rownames(turnout_no_cov_tbl), xlab = "turnout", main = "Turnout Without Covars")
isDem_no_cov_tbl <- table(df$message_treat, df$democrat)
barplot(isDem_no_cov_tbl,beside = TRUE, legend = rownames(isDem_no_cov_tbl), xlab = "Is a democrat", main = "Voting Pattern Without Covars")

# Graphed mean values of non-NA turnout & outcome relative to each cofactor
# This hinges on the concept that booleans are treated in mean() as '1' or '0'.
# Recalling that gender is now gender.y, given NA values in gender.x

genderTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$gender[!is.na(df$turnout2014)]), mean)
ageTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)],df$age_bin[!is.na(df$turnout2014)]),mean)
raceTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$race_clean[!is.na(df$turnout2014)]),mean)

# Gender turnout Barplot

barplot(genderTurnoutSuccess, beside = TRUE, legend = rownames(genderTurnoutSuccess), xlab = "Gender", main = "Gender Turnout Success", ylab = "percentage")

# Age turnout barplot

barplot(ageTurnoutSuccess, beside = TRUE, legend = rownames(ageTurnoutSuccess), xlab = "Age", main = "Age Turnout Success", ylab = "percentage")

#Race turnout barplot

barplot(raceTurnoutSuccess, beside = TRUE, legend = rownames(raceTurnoutSuccess), xlab = "race", main = "Race Turnout Success", ylab = "percentage")

# Known cosmetic issue-- final bars covered by legend. I suspect that this is an easy fix, but not one that I really had time to make
# Outcome graphs-- same methodology

genderOutcome <-  tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$gender[!is.na(df$democrat)]), mean)
ageOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$age_bin[!is.na(df$democrat)]), mean)
raceOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$race_clean[!is.na(df$democrat)]), mean)

barplot(genderOutcome, beside = TRUE, legend = rownames(genderOutcome), xlab = "Gender", ylab = "Percent Democrat Vote", main = "Voting Patterns by Gender")
barplot(ageOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Age Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Age")
# Appears to be a large spike in voting democrat in 18-34 demographic in pro-health. Not sure if a fluke in my graphing or something significant

barplot(raceOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Race Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Race")

# Data listed as present in environment not present in bar graphs. May be a factor of the behavior of the mean() function relative to NA values, though I
# subsetted specifically to avoid this, and used the clean data. Would review given time.

# I somewhat wish there was some more documentation on this data-- while the names are incredibly intuitive and immensely helpful,
# there seemed to be some edge/ placeholder values.
# Perhaps part of the point of this exercise was to see if we could recognize those values-- if so, I did my best to account for them
# in calculation.
	# Jane Smith project analysis.
	rm(list = ls())

	# Read in data ----
	wd <- "~/"
	rand_data <- read.csv(paste0(wd, "data/smith_rand.csv"), stringsAsFactors = F)
	covar_data <- read.csv(paste0(wd, "data/smith_covariates.csv"), stringsAsFactors = F)
	outcome_data <- read.csv(paste0(wd, "data/smith_outcomes.csv"), stringsAsFactors = F)

	# Combine randomization, covariate, and survey outcome data
	df <- merge(rand_data, covar_data, by = "ai_id")
	df <- merge(df, outcome_data, by = "ai_id")

	# Get rid of duplicated ai_ids
	df <- df[!duplicated(df$ai_id),]

	# Clean Data -----
	# Take covariate data from covariate file
	# error in df$gender assignment-- df$gender.x has frequent NA vals
	# changed to df$gender.y, which has no NA vals
	# (as tested with length(df$gender.y[is.na(df$gender.y)]))
	df$gender <- df$gender.y
	df$race <- df$race.x
	df$age <- df$age.x

	# Cut down number of race categories
	df$race_clean <- NA
	df$race_clean[df$race == "caucasian"] <- "0White"
	df$race_clean[df$race == "black"] <- "1Black"
	df$race_clean[df$race == "hispanic"] <- "2Hispanic"
	df$race_clean[df$race == "asian" \|
	df$race == "middleEastern" \|
	df$race == "unknown"] <- "3Other"

	# Bin age
	df$age_bin <- NA
	df$age_bin[df$age >= 18 & df$age < 34] <- "18-34"
	df$age_bin[df$age >= 35 & df$age < 44] <- "35-44"
	df$age_bin[df$age >= 45 & df$age < 64] <- "45-64"
	df$age_bin[df$age >= 65 & df$age != 999 ] <- "65+"
	df$age_bin[df$age == 999] <- "Unknown"
	# Added additional category for 999-- placeholder for unknown age

	# Something for me to note-- I spent a good bit of code making some of these categories factors
	# when I was dealing with the analysis. If I could have remembered how, I would have imported some
	# of the Strings as factors in the first place. I realize now, right before submission,
	# that I could have made those columns factors. While my code works, that is one of the major improvements
	# that I would make to it.

	# Check for balance across assignment -----
	# So I know we want to check that there is no statistically significant
	# relationship between treatment and any of the covariates. I'm not sure
	# how to do because there are 3 treatment categories.
	tapply(df$gender.y, df$message_treat, table)
	tapply(df$race_clean, df$message_treat, table)
	tapply(df$age_bin, df$message_treat, table)

	#Calculations can be run on these tables

	# Estimate voter persuasion treatment effects -----
	# Code outcome variable
	df$democrat <- NA
	df$democrat[df$outcome == "Democrat Jane Smith"] <- TRUE
	df$democrat[df$outcome == "Republican Paul Jones"] <- FALSE

	persuade_no_covar <- glm(democrat ~ message_treat, data = df, family = "binomial")
	summary(persuade_no_covar)

	# Control for race, age bins, gender, partisanship, and whether they turned
	# out in 2014.
	persuade_covar <- glm(democrat ~ message_treat + race_clean + age + gender +
	partisanscore + turnout2014,
	data = df, family = "binomial")
	summary(persuade_covar)

	# Looks like ProHealth was effective and ProTeach wasn't.

	# Estimate voter mobilization treatment effects -----

	turnout_no_covar <- glm(turnout2014 ~ message_treat, data = df,
	family = "binomial")
	summary(turnout_no_covar)

	turnout_covar <- glm(turnout2014 ~ message_treat + race_clean + age + gender +
	partisanscore,
	data = df, family = "binomial")
	summary(turnout_covar)

	# Looks like both ProHealth and ProTeach increased turnout, but ProTeach by more.

	# additional idea from me-- see if there is a relationship between any covar
	# and a NA result.

	# tested idea using following code
	# tapply(tb$age_bin, tb$message_treat, table) [etc, using different covars]
	# No significant difference witnessed. I'm not completely sure what NA
	# means in this context, but it seems trivial, and even were it to be non-trivial
	# it seems statistically insignificant

	# Graphs ------
	# I didn't have time to create the graphs for the presentation. Could you do it?
	# I know it needs to have the treatment effect in all groups and 90% error bars.

	# Created graphs on each of the two measures with & without covars
	turnout_no_cov_tbl <- table(df$message_treat, df$turnout2014)
	barplot(turnout_no_cov_tbl,beside = TRUE, legend = rownames(turnout_no_cov_tbl), xlab = "turnout", main = "Turnout Without Covars")
	isDem_no_cov_tbl <- table(df$message_treat, df$democrat)
	barplot(isDem_no_cov_tbl,beside = TRUE, legend = rownames(isDem_no_cov_tbl), xlab = "Is a democrat", main = "Voting Pattern Without Covars")

	# Graphed mean values of non-NA turnout & outcome relative to each cofactor
	# This hinges on the concept that booleans are treated in mean() as '1' or '0'.
	# Recalling that gender is now gender.y, given NA values in gender.x

	genderTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$gender[!is.na(df$turnout2014)]), mean)
	ageTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)],df$age_bin[!is.na(df$turnout2014)]),mean)
	raceTurnoutSuccess <- tapply(df$turnout2014[!is.na(df$turnout2014)], list(df$message_treat[!is.na(df$turnout2014)], df$race_clean[!is.na(df$turnout2014)]),mean)

	# Gender turnout Barplot

	barplot(genderTurnoutSuccess, beside = TRUE, legend = rownames(genderTurnoutSuccess), xlab = "Gender", main = "Gender Turnout Success", ylab = "percentage")

	# Age turnout barplot

	barplot(ageTurnoutSuccess, beside = TRUE, legend = rownames(ageTurnoutSuccess), xlab = "Age", main = "Age Turnout Success", ylab = "percentage")

	#Race turnout barplot

	barplot(raceTurnoutSuccess, beside = TRUE, legend = rownames(raceTurnoutSuccess), xlab = "race", main = "Race Turnout Success", ylab = "percentage")

	# Known cosmetic issue-- final bars covered by legend. I suspect that this is an easy fix, but not one that I really had time to make
	# Outcome graphs-- same methodology

	genderOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$gender[!is.na(df$democrat)]), mean)
	ageOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$age_bin[!is.na(df$democrat)]), mean)
	raceOutcome <- tapply(df$democrat[!is.na(df$democrat)], list(df$message_treat[!is.na(df$democrat)], df$race_clean[!is.na(df$democrat)]), mean)

	barplot(genderOutcome, beside = TRUE, legend = rownames(genderOutcome), xlab = "Gender", ylab = "Percent Democrat Vote", main = "Voting Patterns by Gender")
	barplot(ageOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Age Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Age")
	# Appears to be a large spike in voting democrat in 18-34 demographic in pro-health. Not sure if a fluke in my graphing or something significant

	barplot(raceOutcome, beside = TRUE, legend = rownames(ageOutcome), xlab = "Race Groups", ylab = "Percent Democrat Vote", main = "Voting Patterns by Race")

	# Data listed as present in environment not present in bar graphs. May be a factor of the behavior of the mean() function relative to NA values, though I
	# subsetted specifically to avoid this, and used the clean data. Would review given time.

	# I somewhat wish there was some more documentation on this data-- while the names are incredibly intuitive and immensely helpful,
	# there seemed to be some edge/ placeholder values.
	# Perhaps part of the point of this exercise was to see if we could recognize those values-- if so, I did my best to account for them
	# in calculation.