Skip to content

Instantly share code, notes, and snippets.

@andrewbtran
Created August 31, 2018 19:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewbtran/79268a81adce99c081b2cf32ae8fe2c7 to your computer and use it in GitHub Desktop.
Save andrewbtran/79268a81adce99c081b2cf32ae8fe2c7 to your computer and use it in GitHub Desktop.
Scraping White House (WH) financial disclosures
library(tidyverse)
library(rvest)
# Scrape the OGE filing index --------------------------------------------
# Fetch the "PAS Filings by Date" view and use the first HTML table on the
# page as the index of filings.
page <- xml2::read_html("https://extapps2.oge.gov/201/Presiden.nsf/PAS%20Filings%20by%20Date?OpenView")
tbl <- html_table(page)[[1]]
tbl <- as_tibble(tbl)
names(tbl) <- c("date", "report", "name", "department", "position")
tbl

# Attach the first link found in each row of the `#index` table.
# NOTE(review): assumes `#index tr` yields exactly one node per row of `tbl`,
# in the same order -- confirm against the live page.
tbl$url <- page %>%
  html_nodes("#index tr") %>%
  html_node("a") %>%
  html_attr("href")

# Explore which report types exist. View() needs an interactive session, so
# it is skipped when the script is run with Rscript.
if (interactive()) {
  tbl %>% count(report, sort = TRUE) %>% View()
}
tbl %>% filter(str_detect(report, "Annual")) %>% count(report, sort = TRUE)

# The report label shows up both as "Annual (2018)" and as "Annual (2018) "
# (trailing whitespace), so trim before comparing; otherwise the second
# variant is silently dropped.
annual_2018 <- tbl %>% filter(str_trim(report) == "Annual (2018)")

# Download every 2018 annual report into pdfs/ ---------------------------
# NOTE(review): assumes the scraped hrefs are absolute URLs -- confirm.
url <- annual_2018$url
path <- file.path("pdfs", basename(url))
# Do not warn when the directory already exists (re-running the script).
dir.create("pdfs", showWarnings = FALSE)
# mode = "wb": PDFs are binary; the default text mode corrupts them on Windows.
walk2(url, path, download.file, mode = "wb")

# purrr reminder:
#   1 argument  -> map()
#   2 arguments -> map2()
#   n arguments -> pmap()
# The walk*() variants (used above) are for side effects only.
library(tabulizer)
# Prototype on a single filing --------------------------------------------
path <- "pdfs/Alexander-Acosta-2018-278.pdf"
tables <- tabulizer::extract_tables(path)
lines <- tables[[1]]

# Name the columns on a copy of the matrix before converting, so as_tibble()
# never sees an unnamed matrix and `lines` itself stays untouched.
acosta <- lines
colnames(acosta) <- c("org", "type", "position", "from", "to")
acosta <- as_tibble(acosta)

# Locate the headings that bracket section 1 of the form.
line_table_1 <- str_which(acosta$org, "1. Filer's Position")
line_table_2 <- str_which(acosta$org, "2. Filer's Employment")

# NOTE(review): the +3 / -2 offsets presumably skip the heading/column-header
# rows after the first marker and the spacer before the second -- confirm
# against the extracted matrix.
acosta_position <- acosta[(line_table_1 + 3):(line_table_2 - 2), ]
acosta_position

# Rows with an empty `to` are continuation ("carryover") lines holding wrapped
# cell text. Glue each continuation's `type` onto the entry directly above it:
# `entry` numbers the real rows, so a real row and the carryover rows that
# follow it share one id. Entries without a continuation keep their `type`
# unchanged (no "NA" suffix, no text borrowed from a later entry).
acosta_position %>%
  mutate(
    carryover = is.na(to) | to == "",
    entry = cumsum(!carryover)
  ) %>%
  group_by(entry) %>%
  mutate(type3 = paste0(type, collapse = "")) %>%
  ungroup() %>%
  filter(!carryover) %>%
  select(-type, -entry)
# Basic idea for scaling up -----------------------------------------------
#' Extract the "Filer's Positions" entries from one parsed disclosure table
#'
#' Takes the rows between the "1. Filer's Position" and
#' "2. Filer's Employment" headings and merges wrapped `type` text from
#' continuation lines (rows whose `to` cell is empty) into the entry directly
#' above them.
#'
#' @param lines A five-column character matrix (or data frame), e.g. one
#'   element of the list returned by `tabulizer::extract_tables()`. Columns
#'   are taken, in order, as org, type, position, from, to.
#' @return A tibble with one row per entry and the columns `org`, `position`,
#'   `from`, `to`, `carryover` (always `FALSE` after filtering) and `type3`
#'   (the complete organization type). Zero rows if the section is empty.
table1 <- function(lines) {
  # Name the columns before converting so as_tibble() never sees an unnamed
  # matrix (deprecated since tibble 2.0).
  filing <- lines
  colnames(filing) <- c("org", "type", "position", "from", "to")
  filing <- as_tibble(filing)

  line_table_1 <- str_which(filing$org, "1. Filer's Position")
  line_table_2 <- str_which(filing$org, "2. Filer's Employment")
  if (length(line_table_1) == 0 || length(line_table_2) == 0) {
    stop(
      "Could not find the section 1 / section 2 headings in `lines`.",
      call. = FALSE
    )
  }

  # NOTE(review): the +3 / -2 offsets presumably skip the heading/column-header
  # rows after the first marker and the spacer before the second -- confirm.
  first_row <- line_table_1[1] + 3
  last_row <- line_table_2[1] - 2
  # seq(length.out = ) yields an empty selection for an empty section instead
  # of the backwards range that `first_row:last_row` would produce.
  rows <- seq(first_row, length.out = max(last_row - first_row + 1, 0))

  filing[rows, ] %>%
    mutate(
      carryover = is.na(to) | to == "",
      # A real row and the continuation rows that follow it share one id.
      entry = cumsum(!carryover)
    ) %>%
    group_by(entry) %>%
    mutate(type3 = paste0(type, collapse = "")) %>%
    ungroup() %>%
    filter(!carryover) %>%
    select(-type, -entry)
}
# Run the extraction over every downloaded filing and stack the results into
# one data frame.
# NOTE(review): assumes every PDF has the same layout as the Acosta filing,
# with the positions section in its first extracted table -- confirm.
paths <- dir("pdfs/", full.names = TRUE)
pdfs <- paths %>%
  map(tabulizer::extract_tables) %>%
  # extract_tables() returns a list of tables per PDF, while table1() works on
  # a single table, so hand it only the first one (as in the prototype).
  map_dfr(~ table1(.x[[1]]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment