Created
December 18, 2013 02:57
-
-
Save mihirzaveri/8016606 to your computer and use it in GitHub Desktop.
The (patchy) R code I used to sketch a final project for my UC Berkeley data visualization class in fall '13.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Session setup: move into the project's working directory and keep strings as
# plain character vectors when CSVs are read.
# NOTE(review): the relative setwd() path only resolves when R is started from
# the directory that contains "Documents/" -- confirm before running elsewhere.
setwd("Documents/grad_school/dataviz-fall-2013/final-project/working/")
options(stringsAsFactors = FALSE)

# Legislator lookup table: columns 1, 2 and 23 of the roster, plus the state.
# NOTE(review): later code reads last_name, first_name and govtrack_id from a
# lookup like this one, so those are presumably the three columns -- confirm.
legislator_ids <- read.csv("legislators-current.csv")
govtrack_ID <- legislator_ids[, c(1, 2, 23)]
govtrack_ID$state <- as.character(legislator_ids$state)

# Spot check on a single legislator file: total missed votes in session 2013.
brown_sherrod <- read.csv("legislator_csvs/400050.csv")
brown_sherrod_2013 <- subset(brown_sherrod, session == 2013)
sum(brown_sherrod_2013$missed_votes)

# File names of the per-legislator CSVs, and the accumulator the loading step
# binds them into.
links <- dir("mihir2")
data <- NULL
# Build the relative path of one per-legislator CSV inside "mihir2/".
#
# id: file name as listed by dir("mihir2").
# Returns the path string for read.csv() to open. The original built this path
# but returned the global `data` object instead, so no file could ever be read.
load_data <- function(id) {
  paste0("mihir2/", id)
}
# Read every per-legislator CSV and tag its rows with the source file name.
# The frames are stacked with a single rbind() call rather than re-copying a
# growing data frame on every loop iteration.
legislator_frames <- lapply(links, function(file_name) {
  df <- read.csv(load_data(file_name))
  df$file <- file_name
  df
})
# `data` is kept as the first argument so anything already accumulated in it
# (NULL on a fresh run) is preserved, as in the original loop.
data <- do.call(rbind, c(list(data), legislator_frames))

# Split by chamber, then keep the House rows for session 2013.
house_data <- subset(data, chamber == "h")
senate_data <- subset(data, chamber == "s")
house_2013 <- subset(house_data, session == 2013)

# Break each file name at the dots; the first piece is used as the id later.
id_split <- strsplit(house_2013$file, "[.]")
# Return the first piece of one strsplit() result.
#
# element: vector of pieces for a single split string.
# Returns a length-one vector holding the first piece (NA if `element` is
# empty).
get_first_element <- function(element) {
  element[1]
}
# Legislator id = first piece of the split file name. vapply() guarantees a
# character vector even when there are no rows.
house_2013$id_only <- vapply(id_split, get_first_element, character(1))

# Total missed votes per id in 2013, largest first.
house_2013_totals <- aggregate(
  house_2013$missed_votes,
  by = list(house_2013$id_only),
  sum
)
names(house_2013_totals) <- c("id", "missed_votes")
h2013_totals_ordered <- house_2013_totals[
  order(house_2013_totals$missed_votes, decreasing = TRUE),
]

# Attach names from the roster lookup. The code below reads last_name and
# first_name from this table, but the original never created those columns.
# NOTE(review): this join is reconstructed from the id/govtrack_id match used
# later for the state column; it assumes govtrack_ID holds last_name,
# first_name and govtrack_id -- confirm against legislators-current.csv.
stopifnot(
  all(c("last_name", "first_name", "govtrack_id") %in% names(govtrack_ID))
)
name_rows <- match(
  as.numeric(h2013_totals_ordered$id),
  as.numeric(govtrack_ID$govtrack_id)
)
h2013_totals_ordered$last_name <- govtrack_ID$last_name[name_rows]
h2013_totals_ordered$first_name <- govtrack_ID$first_name[name_rows]

# Ids with no roster entry; these are the ones patched by hand below.
na_totals <- subset(h2013_totals_ordered, is.na(last_name))

nonfactor_h2013totals <- h2013_totals_ordered[, 1:4]
# Renumber rows to match the sorted order (was hard-coded to 1:436).
rownames(nonfactor_h2013totals) <- seq_len(nrow(nonfactor_h2013totals))

# Manual name patches for rows without a roster match.
# NOTE(review): the row numbers depend on the exact sort order of the original
# session -- verify that rows 6, 78 and 200 are still these people.
nonfactor_h2013totals$last_name[6] <- "Young"
nonfactor_h2013totals$first_name[6] <- "Bill"
nonfactor_h2013totals$last_name[78] <- "Bonner"
nonfactor_h2013totals$first_name[78] <- "Jo"
nonfactor_h2013totals$last_name[200] <- "Emerson"
nonfactor_h2013totals$first_name[200] <- "Jo Ann"

final_2013_data <- nonfactor_h2013totals
#swing_last_names <- read.csv("swing_last_names.csv") | |
#swing_last_names <- as.vector(swing_last_names) | |
#final_2013_data[,"swing"] <- NA | |
#steve king | |
#final_2013_data$swing[153] <- TRUE | |
#swing district data from Daily Kos | |
library(plyr)

# Per-district table; the file has no header row, so names are assigned here.
swing_districts <- read.csv("swing_legislators - Sheet1.csv", header = FALSE)
names(swing_districts) <- c(
  "district", "legislator", "party",
  "obama12", "romney12", "obama08", "mccain08"
)

# 2012 margin (obama12 minus romney12); a district counts as "swing" when the
# margin is strictly inside (-5, 5).
swing_districts$diff12 <- swing_districts$obama12 - swing_districts$romney12
swing_actual <- subset(swing_districts, abs(diff12) < 5)

# Keep only the legislator name and the margin (columns 2 and 8 originally).
swing_actual <- swing_actual[, c("legislator", "diff12")]

# Split "Last,First" on the comma into separate columns.
swing_actual$leg_nonfactor <- as.character(swing_actual$legislator)
swing_split <- strsplit(swing_actual$leg_nonfactor, ",")
swing_split_names <- ldply(swing_split)
names(swing_split_names) <- c("last", "first")
swing_actual$last <- swing_split_names$last
swing_actual$first <- swing_split_names$first
final_2013_working <- final_2013_data
# "Last, First" key to compare against the swing table's legislator strings.
final_2013_working$combined_name <- paste0(
  final_2013_working$last_name, ", ", final_2013_working$first_name
)

#swing-join section
#default is all legislators are not from swing, then we will mark the swing
#with TRUEs in the swing column
final_2013_working$swing <- FALSE

#where do the swing legislators show up in the missed vote data?
match_order <- match(swing_actual$leg_nonfactor, final_2013_working$combined_name)

# FIRST: tackle the ones where the matching worked
match_order_nona <- match_order[!is.na(match_order)]
#how many matched
length(match_order_nona)

# Flag every matched row at once. The original looped over a hard-coded 1:38,
# which breaks as soon as the number of matches is not exactly 38.
final_2013_working$swing[match_order_nona] <- TRUE

# Why final_2013_working$swing[match_order_nona, ] <- TRUE did not work:
# `$swing` is a plain vector, so it takes a single subscript ([rows]); the
# two-subscript form [rows, cols] only applies to the data frame itself.
#SECOND: find the ones that did not work in matching
match_order
#indexes of NA values in match_order
non_matches <- which(is.na(match_order))
swing_actual[non_matches, ]

# Rows of the non-matches in final_2013_working, found by hand. The original
# used `non_match_index` below without ever defining it; the values come from
# the author's note listing these indexes.
# NOTE(review): presumably in the same order as the last names checked below
# -- confirm.
non_match_index <- c(189, 234, 59, 303, 430, 352, 431, 419, 291, 136)

# Print each candidate row for a manual check. The author marked all of them
# "checks out"; for Rogers there are two Mike Rogers and the first one
# (400342) is the correct one.
unmatched_last_names <- c(
  "McKeon", "Royce", "Rogers", "Walz", "Heck",
  "Bishop", "Meehan", "Dent", "Rigell", "Forbes"
)
for (surname in unmatched_last_names) {
  print(final_2013_working[which(final_2013_working$last_name == surname), ])
}

final_2013_working$swing[non_match_index] <- TRUE
# Bar colours: cyan for rows flagged as swing, yellow for everyone else.
# Sized from the data instead of a hard-coded 436.
colors <- rep("yellow", nrow(final_2013_working))
swing_numbers <- which(final_2013_working$swing)
colors[swing_numbers] <- "cyan"
barplot(final_2013_working$missed_votes, col = colors)
#break it out by state
# NOTE(review): `id_data` is never defined in this file. govtrack_ID (roster
# columns plus state) looks like the intended lookup, so it is used when no
# id_data object already exists -- confirm.
if (!exists("id_data")) {
  id_data <- govtrack_ID
}
# Compare ids as numbers on both sides before matching.
final_2013_working$id <- as.numeric(final_2013_working$id)
id_data$govtrack_id <- as.numeric(id_data$govtrack_id)
id_match_numbers <- match(final_2013_working$id, id_data$govtrack_id)
final_2013_working$state <- id_data$state[id_match_numbers]

#may or may not need to do the following, may have failed to load properly
#from earlier
final_2013_working$last_name <- as.character(final_2013_working$last_name)
final_2013_working$first_name <- as.character(final_2013_working$first_name)
final_2013_working$last_name[78] <- "Bonner"
final_2013_working$first_name[78] <- "Jo"
final_2013_working$last_name[200] <- "Emerson"
final_2013_working$first_name[200] <- "Jo Ann"
final_2013_working$last_name[205] <- "Alexander"
final_2013_working$first_name[205] <- "Rodney"

#fixing the NAs, Bill Young who died
final_2013_working$state[6] <- "FL"
#jo bonner, AL 1st
final_2013_working$state[78] <- "AL"
#jo ann emerson, MO 8th
final_2013_working$state[200] <- "MO"
#rodney alexander
# TODO(review): the gist shows only a "[[code here]]" placeholder at this
# point (not valid R). The state for row 205, and whatever else was elided,
# still has to be supplied.
#trying out a function to port over the no match differences
# Copy diff12 values from a swing lookup table onto chosen rows of `data`.
#
# last_names: last names to look up in `swing$last`.
# indexes:    row numbers in `data`, parallel to `last_names`.
# data:       data frame to annotate; a diff12 column is added if missing.
# swing:      lookup table with `last` and `diff12` columns. Defaults to the
#             global swing_actual, which the original body read directly.
#
# Returns the modified copy of `data`. The caller must assign the result, e.g.
#   final_2013_working <- differences(names, rows, final_2013_working)
# The original returned nothing, so its changes were lost, and it looped over
# a hard-coded 1:9. When a last name occurs more than once in `swing`, the
# first occurrence is used; names not found give NA.
differences <- function(last_names, indexes, data, swing = swing_actual) {
  stopifnot(length(last_names) == length(indexes))
  if (length(last_names) == 0) {
    return(data)
  }
  margins <- swing$diff12[match(last_names, swing$last)]
  data[indexes, "diff12"] <- margins
  data
}
#failing at function, manually inserting difference values for no matches
# Console transcript of the lookups, kept as comments so the script parses.
# (The first two lookups compare diff12 to a name, hence the 0-row results.)
# > subset(swing_actual, swing_actual$diff12 == "McKeon")
# [1] legislator diff12 leg_nonfactor last first
# <0 rows> (or 0-length row.names)
# > subset(swing_actual, swing_actual$diff12 == "Royce")
# [1] legislator diff12 leg_nonfactor last first
# <0 rows> (or 0-length row.names)
# > subset(swing_actual, swing_actual$last == "McKeon")
# legislator diff12 leg_nonfactor last first
# 46 McKeon, Buck -1.9 McKeon, Buck McKeon Buck
# > subset(swing_actual, swing_actual$last == "Royce")
# legislator diff12 leg_nonfactor last first
# 60 Royce, Ed -3.7 Royce, Ed Royce Ed
# > subset(swing_actual, swing_actual$last == "Rogers")
# legislator diff12 leg_nonfactor last first
# 206 Rogers, Mike J. -3.1 Rogers, Mike J. Rogers Mike J.
# > subset(swing_actual, swing_actual$last == "Walz")
# legislator diff12 leg_nonfactor last first
# 213 Walz, Tim 1.4 Walz, Tim Walz Tim
# > subset(swing_actual, swing_actual$last == "Heck")
# legislator diff12 leg_nonfactor last first
# 270 Heck, Joe 0.8 Heck, Joe Heck Joe
# > subset(swing_actual, swing_actual$last == "Bishop")
# legislator diff12 leg_nonfactor last first
# 272 Bishop, Tim 0.5 Bishop, Tim Bishop Tim
# > subset(swing_actual, swing_actual$last == "Meehan")
# legislator diff12 leg_nonfactor last first
# 331 Meehan, Pat -1.9 Meehan, Pat Meehan Pat
# > subset(swing_actual, swing_actual$last == "Dent")
# legislator diff12 leg_nonfactor last first
# 339 Dent, Charlie -2.9 Dent, Charlie Dent Charlie
# > subset(swing_actual, swing_actual$last == "Rigell")
# legislator diff12 leg_nonfactor last first
# 403 Rigell, Scott 1.5 Rigell, Scott Rigell Scott
# > subset(swing_actual, swing_actual$last == "Forbes")
# legislator diff12 leg_nonfactor last first
# 405 Forbes, Randy -1.3 Forbes, Randy Forbes Randy
library(ggplot2)

# diff12 values for the ten swing legislators whose names did not match,
# paired position-by-position with their rows in final_2013_working.
diff_numbers <- c(-1.9, -3.7, -3.1, 1.4, .8, .5, -1.9, -2.9, 1.5, -1.3)
final_nonmatch_index <- c(189, 234, 59, 303, 430, 352, 431, 419, 291, 136)

# Index the column by name on the data frame itself, so the assignment also
# works if diff12 has not been created yet (other rows are then NA).
final_2013_working[final_nonmatch_index, "diff12"] <- diff_numbers

# Missed votes against the absolute 2012 margin, with a linear fit. aes()
# refers to bare column names; ggplot2 resolves them in the `data` argument.
ggplot(final_2013_working, aes(x = abs(diff12), y = missed_votes)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment