Skip to content

Instantly share code, notes, and snippets.

@mihirzaveri
Created December 18, 2013 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mihirzaveri/8016606 to your computer and use it in GitHub Desktop.
Save mihirzaveri/8016606 to your computer and use it in GitHub Desktop.
The (patchy) R code I used to sketch a final project for my UC Berkeley data visualization class in fall '13.
# ---- Setup -----------------------------------------------------------------
# Every relative path used below is resolved against this project folder.
setwd("Documents/grad_school/dataviz-fall-2013/final-project/working/")
# Ask read.csv() to keep text columns as character vectors, not factors.
options(stringsAsFactors = FALSE)

# Lookup table: columns 1, 2 and 23 of the legislators file, plus the state
# stored as plain text.
legislator_ids <- read.csv("legislators-current.csv")
govtrack_ID <- legislator_ids[, c(1, 2, 23)]
govtrack_ID$state <- as.character(legislator_ids$state)

# Sanity check on a single file (400050.csv): total missed votes in the rows
# whose session is 2013. which() drops NA comparisons, as subset() would.
brown_sherrod <- read.csv("legislator_csvs/400050.csv")
brown_sherrod_2013 <- brown_sherrod[
  which(brown_sherrod$session == 2013), , drop = FALSE
]
sum(brown_sherrod_2013$missed_votes)

# File names of the per-legislator CSVs, and an empty accumulator for them.
links <- list.files("mihir2")
data <- NULL
#' Build the relative path of one per-legislator CSV inside "mihir2/".
#'
#' @param id File name (or vector of file names) as listed by `dir("mihir2")`.
#' @return Character vector of paths of the form "mihir2/<id>", ready to be
#'   handed to `read.csv()`.
load_data <- function(id) {
  # Return the path itself. Returning the global `data` accumulator (NULL
  # before any file has been read) would leave read.csv() without a file name.
  paste0("mihir2/", id)
}
# Read every per-legislator CSV, tag each row with the file it came from, and
# stack all pieces with a single rbind(). Growing `data` with rbind() inside a
# loop re-copies the accumulated rows on every iteration.
# With no files listed, `data` ends up NULL, as it was before the loop.
data <- do.call(rbind, lapply(links, function(file_name) {
  df <- read.csv(load_data(file_name))
  df$file <- file_name
  df
}))
# Split the combined table by chamber code ("h" / "s"), then keep the "h" rows
# whose session is 2013. which() discards NA comparisons, matching subset().
house_data <- data[which(data$chamber == "h"), , drop = FALSE]
senate_data <- data[which(data$chamber == "s"), , drop = FALSE]
house_2013 <- house_data[which(house_data$session == 2013), , drop = FALSE]
# Break each source file name apart at its literal dots.
id_split <- strsplit(house_2013$file, ".", fixed = TRUE)
#' Return the leading entry of a vector.
#'
#' Applied to each piece produced by `strsplit()` on the source file names.
#'
#' @param element A vector.
#' @return `element[1]`: a length-one vector holding the first entry (NA of the
#'   matching type when an atomic `element` is empty).
get_first_element <- function(element) {
  leading <- element[1L]
  leading
}
# First piece of each split file name (the part before the first dot).
# vapply() pins the result to exactly one string per row, whatever the input.
house_2013$id_only <- vapply(id_split, get_first_element, character(1))

# Sum missed_votes per id_only value, then sort with the largest total first.
house_2013_totals <- aggregate(
  house_2013$missed_votes,
  by = list(house_2013$id_only),
  sum
)
names(house_2013_totals) <- c("id", "missed_votes")
h2013_totals_ordered <- house_2013_totals[
  order(house_2013_totals$missed_votes, decreasing = TRUE),
]

# NOTE(review): at this point the table only has `id` and `missed_votes`, yet
# the statements below read `last_name`/`first_name` and select four columns.
# A join against the legislator lookup appears to be missing from this script
# and needs to be restored here - TODO confirm against the original session.
na_totals <- subset(h2013_totals_ordered, is.na(h2013_totals_ordered$last_name))
nonfactor_h2013totals <- h2013_totals_ordered[, 1:4]
# The literal 436 is kept on purpose: the row numbers patched below were read
# off this exact ordering, and the assignment fails if the row count differs.
rownames(nonfactor_h2013totals) <- 1:436
# Fill in names by hand for rows 6, 78 and 200.
nonfactor_h2013totals$last_name[6] <- "Young"
nonfactor_h2013totals$first_name[6] <- "Bill"
nonfactor_h2013totals$last_name[78] <- "Bonner"
nonfactor_h2013totals$first_name[78] <- "Jo"
nonfactor_h2013totals$last_name[200] <- "Emerson"
nonfactor_h2013totals$first_name[200] <- "Jo Ann"
final_2013_data <- nonfactor_h2013totals
#swing_last_names <- read.csv("swing_last_names.csv")
#swing_last_names <- as.vector(swing_last_names)
#final_2013_data[,"swing"] <- NA
#steve king
#final_2013_data$swing[153] <- TRUE
#swing district data from Daily Kos
# The sheet is read without a header row, so the columns are named by hand.
swing_districts <- read.csv("swing_legislators - Sheet1.csv", header = FALSE)
names(swing_districts) <- c(
  "district", "legislator", "party",
  "obama12", "romney12", "obama08", "mccain08"
)
# diff12 is the obama12 column minus the romney12 column.
swing_districts$diff12 <- with(swing_districts, obama12 - romney12)

# Keep the rows whose diff12 lies strictly between -5 and 5, and only
# columns 2 and 8 (legislator and diff12).
swing_actual <- swing_districts[
  which(abs(swing_districts$diff12) < 5), c(2, 8)
]
swing_actual$leg_nonfactor <- as.character(swing_actual$legislator)

# Split each legislator string at its comma and store the two pieces as the
# `last` and `first` columns.
swing_split <- strsplit(swing_actual$leg_nonfactor, ",")
library(plyr)
swing_split_names <- ldply(swing_split)
names(swing_split_names) <- c("last", "first")
swing_actual[c("last", "first")] <- swing_split_names[c("last", "first")]
final_2013_working <- final_2013_data
# "Last, First" key, built to line up with swing_actual$leg_nonfactor.
final_2013_working$combined_name <- paste0(
  final_2013_working$last_name, ", ", final_2013_working$first_name
)

# swing-join section
# Default: no legislator is from a swing district; matched rows are then
# flipped to TRUE in the swing column.
final_2013_working$swing <- FALSE

# Where do the swing legislators show up in the missed-vote data?
match_order <- match(swing_actual$leg_nonfactor, final_2013_working$combined_name)

# FIRST: handle the names for which the matching worked.
match_order_nona <- match_order[!is.na(match_order)]
# How many matched?
length(match_order_nona)
# One vectorized assignment marks every matched row, however many there are
# (a loop over a fixed 1:38 breaks as soon as the match count is not 38).
final_2013_working$swing[match_order_nona] <- TRUE
# `final_2013_working$swing[match_order_nona, ] <- TRUE` does not work because
# `$swing` is a plain vector: it takes a single index, so the trailing comma
# (a second, empty dimension) is an error.

# SECOND: find the names that did not match.
match_order
# Positions of the NA values in match_order, and the swing rows they refer to.
non_matches <- which(is.na(match_order))
swing_actual[non_matches, ]
# Row numbers of the non-matches in final_2013_working; each one has to be
# checked by hand: 189, 234, 59, 303, 430, 352, 431, 419, 291, 136
subset(final_2013_working, final_2013_working$last_name == "McKeon") #checks out
subset(final_2013_working, final_2013_working$last_name == "Royce") #checks out
subset(final_2013_working, final_2013_working$last_name == "Rogers") #two mike rogers, first mike rogers (400342) is correct
subset(final_2013_working, final_2013_working$last_name == "Walz") #checks out
subset(final_2013_working, final_2013_working$last_name == "Heck") #checks out
subset(final_2013_working, final_2013_working$last_name == "Bishop") #checks out
subset(final_2013_working, final_2013_working$last_name == "Meehan") #checks out
subset(final_2013_working, final_2013_working$last_name == "Dent") #checks out
subset(final_2013_working, final_2013_working$last_name == "Rigell") #checks out
subset(final_2013_working, final_2013_working$last_name == "Forbes") #checks out
# `non_match_index` has to exist before it is used: these are the ten row
# numbers listed at the top of this section, in the same order.
non_match_index <- c(189, 234, 59, 303, 430, 352, 431, 419, 291, 136)
final_2013_working$swing[non_match_index] <- TRUE
# Bar colours: yellow by default, cyan for rows flagged as swing.
# The vector is sized from the data (not a literal 436) so it always has one
# colour per bar.
colors <- rep("yellow", nrow(final_2013_working))
swing_numbers <- which(final_2013_working$swing == TRUE)
colors[swing_numbers] <- "cyan"
barplot(final_2013_working$missed_votes, col = colors)
#break it out by state
# NOTE(review): `id_data` is never created in this script. `govtrack_ID` (built
# from legislators-current.csv near the top) looks like the intended lookup
# table, so fall back to it when `id_data` is missing - TODO confirm that it
# really carries a `govtrack_id` column.
if (!exists("id_data")) {
  id_data <- govtrack_ID
}
# Compare ids as numbers on both sides, then pull each legislator's state.
final_2013_working$id <- as.numeric(final_2013_working$id)
id_data$govtrack_id <- as.numeric(id_data$govtrack_id)
id_match_numbers <- match(final_2013_working$id, id_data$govtrack_id)
final_2013_working$state <- id_data$state[id_match_numbers]
#may or may not need to do the following, may have failed to load properly from earlier
final_2013_working$last_name <- as.character(final_2013_working$last_name)
final_2013_working$first_name <- as.character(final_2013_working$first_name)
final_2013_working$last_name[78] <- "Bonner"
final_2013_working$first_name[78] <- "Jo"
final_2013_working$last_name[200] <- "Emerson"
final_2013_working$first_name[200] <- "Jo Ann"
final_2013_working$last_name[205] <- "Alexander"
final_2013_working$first_name[205] <- "Rodney"
#fixing the NAs, Bill Young who died
final_2013_working$state[6] <- "FL"
#jo bonner, AL 1st
final_2013_working$state[78] <- "AL"
#jo ann emerson, MO 8th
final_2013_working$state[200] <- "MO"
#rodney alexander
# TODO: the statement for this row was left as a "[[code here]]" placeholder,
# which is not valid R. Following the pattern above it is presumably
#   final_2013_working$state[205] <- "LA"
# but confirm the state and row number before enabling it.
#trying out a function to port over the no match differences
#' Copy `diff12` values from `swing_actual` into chosen rows of `data`.
#'
#' Each last name is looked up in the global `swing_actual$last`; the `diff12`
#' of the first row carrying that name is written to the paired row of `data`.
#'
#' @param last_names Character vector of last names to look up.
#' @param indexes Row numbers of `data`, parallel to `last_names`.
#' @param data Data frame to update; a `diff12` column is created if absent.
#' @return `data` with `diff12` filled in at `indexes` (NA for names that are
#'   not found). Assign the result back: the caller's copy is not modified.
differences <- function(last_names, indexes, data) {
  stopifnot(length(last_names) == length(indexes))
  if (length(indexes) == 0L) {
    return(data)
  }
  # Position of each name in swing_actual (first hit; NA when absent). Sizing
  # the work from the inputs covers every name rather than a fixed 1:9.
  hits <- match(last_names, swing_actual$last)
  data[indexes, "diff12"] <- swing_actual$diff12[hits]
  # `data` is a local copy inside the function, so hand the updated frame back.
  data
}
# The function above did not work, so the difference values for the
# non-matches are inserted manually.
# Console transcript of the manual lookups (kept as comments so that the
# script still parses):
# > subset(swing_actual, swing_actual$diff12 == "McKeon")
# [1] legislator diff12 leg_nonfactor last first
# <0 rows> (or 0-length row.names)
# > subset(swing_actual, swing_actual$diff12 == "Royce")
# [1] legislator diff12 leg_nonfactor last first
# <0 rows> (or 0-length row.names)
# > subset(swing_actual, swing_actual$last == "McKeon")
# legislator diff12 leg_nonfactor last first
# 46 McKeon, Buck -1.9 McKeon, Buck McKeon Buck
# > subset(swing_actual, swing_actual$last == "Royce")
# legislator diff12 leg_nonfactor last first
# 60 Royce, Ed -3.7 Royce, Ed Royce Ed
# > subset(swing_actual, swing_actual$last == "Rogers")
# legislator diff12 leg_nonfactor last first
# 206 Rogers, Mike J. -3.1 Rogers, Mike J. Rogers Mike J.
# > subset(swing_actual, swing_actual$last == "Walz")
# legislator diff12 leg_nonfactor last first
# 213 Walz, Tim 1.4 Walz, Tim Walz Tim
# > subset(swing_actual, swing_actual$last == "Heck")
# legislator diff12 leg_nonfactor last first
# 270 Heck, Joe 0.8 Heck, Joe Heck Joe
# > subset(swing_actual, swing_actual$last == "Bishop")
# legislator diff12 leg_nonfactor last first
# 272 Bishop, Tim 0.5 Bishop, Tim Bishop Tim
# > subset(swing_actual, swing_actual$last == "Meehan")
# legislator diff12 leg_nonfactor last first
# 331 Meehan, Pat -1.9 Meehan, Pat Meehan Pat
# > subset(swing_actual, swing_actual$last == "Dent")
# legislator diff12 leg_nonfactor last first
# 339 Dent, Charlie -2.9 Dent, Charlie Dent Charlie
# > subset(swing_actual, swing_actual$last == "Rigell")
# legislator diff12 leg_nonfactor last first
# 403 Rigell, Scott 1.5 Rigell, Scott Rigell Scott
# > subset(swing_actual, swing_actual$last == "Forbes")
# legislator diff12 leg_nonfactor last first
# 405 Forbes, Randy -1.3 Forbes, Randy Forbes Randy
# diff12 values for the ten manually checked legislators, paired position by
# position with their row numbers in final_2013_working.
diff_numbers <- c(-1.9, -3.7, -3.1, 1.4, .8, .5, -1.9, -2.9, 1.5, -1.3)
final_nonmatch_index <- c(189, 234, 59, 303, 430, 352, 431, 419, 291, 136)
# Address the cells by row and column name: this works whether or not a
# `diff12` column already exists (other rows stay NA when it is created here).
final_2013_working[final_nonmatch_index, "diff12"] <- diff_numbers
# NOTE(review): nothing visible in this script copies diff12 over for the
# legislators that matched automatically - verify that step exists, otherwise
# only the ten rows above have a value to plot.
library(ggplot2)
# aes() evaluates its arguments inside the data frame given to ggplot(), so
# columns are named directly instead of via `final_2013_working$...`.
ggplot(final_2013_working, aes(x = abs(diff12), y = missed_votes)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment