Skip to content

Instantly share code, notes, and snippets.

@benjaminrobinson
Last active December 19, 2016 08:03
Show Gist options
  • Save benjaminrobinson/22eb9ad99ef4b9a952fe1060c1f49393 to your computer and use it in GitHub Desktop.
Save benjaminrobinson/22eb9ad99ef4b9a952fe1060c1f49393 to your computer and use it in GitHub Desktop.
#LINK TO DATA SHEETS HERE: http://dchr.dc.gov/public-employee-salary-information
#USE TABULIZER
options(stringsAsFactors = FALSE)
options(scipen = 10)
library(devtools)
install_github("ropenscilabs/tabulizerjars")
install_github("ropenscilabs/tabulizer")
library(tabulizer)
library(rvest)
library(dplyr)
#AUTOMATE SELECTION OF PDF LINKS? USING RVEST
urls <- read_html("http://dchr.dc.gov/public-employee-salary-information") %>%
html_nodes("a") %>%
html_attr("href")
#REMOVE NON-PDF LINKS
urls <- urls[grep("pdf",urls)]
#MOVE MOST RECENT PDF TO FIRST IN THE COUNT
urls <- c(urls[20],urls[1:19])
#CREATE FUNCTION TO PROVIDE QUARTERLY DATES FOR DATA SUBMISSIONS
disclose_date <- function(year,quarter){
x <- as.Date(
paste0(year,
ifelse(quarter==1,"-03-31",
ifelse(quarter==2,"-06-30",
ifelse(quarter==3,"-09-30","-12-31")))))
return(x)
}
#GET DATES FROM 2011 TO 2016
dates <- lapply(2016:2011,disclose_date,4:1)
#SUBSET OUT ALL BUT 4TH QUARTER FOR 2011 AND KEEP ALL BUT 4TH QTR FOR 2016 FOR NOW
dates[[6]] <- dates[[6]][[1]]
dates[[1]] <- subset(dates[[1]],!(dates[[1]] %in% dates[[1]][[1]]))
dates <- unlist(dates)
#CREATE META DATA OF LINKS AND URLS
meta <- cbind(dates,urls) %>%
as.data.frame %>%
mutate(dates=as.Date(as.numeric(dates),"1970-01-01"),dates=as.character(dates))
rm(urls,dates)
#TEST TO SEE IF THE FIRST PAGE OF PDF SCRAPING FAILS...
test_page <- function(x) {
if(nrow(extract_tables(meta$urls[x],pages=1,method="data.frame")[[1]])==0){
return("Failed")
stop()
} else {
print("Passed")
}
}
#RUNNING THE DATA TAKES A DECENT AMOUNT OF TIME
#PARSE PDF! TIME IT TOO!
#x=
parse_pdf <- function(x) {
begin <- Sys.time()
if(test_page(x)=="Passed"){
dchr <- extract_tables(meta$urls[x],pages=1:get_n_pages(meta$urls[x]),method="data.frame")
} else {
dchr <- extract_tables(meta$urls[x],pages=2:get_n_pages(meta$urls[x]),method="data.frame")
}
print(Sys.time()-begin)
return(cbind(dchr,Reporting.Date=meta$dates[x]))
}
##ONLY FIRST SHEET HAS VARIABLE NAMES. OTHER SHEETS READ THE FIRST ROW AS VARIABLE NAMES WHEN THEY ARE NOT.
for(a in 2:length(dchr)){
dchr[[a]] <- rbind(dchr[[a]],names(dchr[[a]]))
names(dchr[[a]]) <- names(dchr[[1]])
}
salary <- do.call(rbind,dchr) %>%
mutate(Export.Date=meta$dates[x],
Hire.Date=as.Date(Hire.Date,"%m/%d/%Y"),
Annual.Rate=sub("([.-])|[[:punct:]]", "\\1",Annual.Rate),
Annual.Rate=as.numeric(Annual.Rate)) %>%
sapply(.,trimws) %>%
as.data.frame
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment