Skip to content

Instantly share code, notes, and snippets.

@Tadge-Analytics
Created June 20, 2020 04:17
Show Gist options
  • Save Tadge-Analytics/f09a6f867f0f8d155690e53126a4b4af to your computer and use it in GitHub Desktop.
Save Tadge-Analytics/f09a6f867f0f8d155690e53126a4b4af to your computer and use it in GitHub Desktop.
library(tidyverse)
library(pdftools)
library(httr)
library(openxlsx)
# Download the eRum 2020 program brochure into a temporary PDF file and read
# its text into `import`, which is later indexed by page number.
url <- "http://2020.erum.io/wp-content/uploads/2020/06/program_brochure_v5_20200617.pdf"
tf <- tempfile(fileext = ".pdf")
response <- GET(url, write_disk(tf))
# Fail early on an HTTP error (404, 5xx, ...) rather than passing whatever the
# server returned to pdf_text().
stop_for_status(response)
import <- pdf_text(tf)
# Pages of the brochure to parse, used as indices into `import`.
# NOTE(review): presumably these are the pages that carry the schedule --
# confirm against the PDF if the brochure version changes.
pages_to_keep <- c(
  11L, 12L, 13L, 15L, 16L, 19L, 20L, 22L, 23L, 24L, 25L, 28L, 29L, 31L, 32L,
  33L, 34L
)

# Build a tibble `x` with one row per retained text line and the columns
# `page_num`, `first` (a token containing ":") and `second` (the rest of the
# line).
x <- import %>%
  .[pages_to_keep] %>%
  tibble(pages = .) %>%
  mutate(
    page_num = pages_to_keep,
    # Split every page into its individual lines and normalise whitespace.
    tidied = map(pages, ~ .x %>%
      # Accept both "\r\n" and "\n" line endings: splitting on "\r\n" alone
      # leaves a whole page as one string when the text uses plain "\n".
      str_split("\r?\n") %>%
      unlist() %>%
      str_squish() %>%
      tibble(content = .))
  ) %>%
  select(-pages) %>%
  unnest(tidied) %>%
  # Keep only non-empty lines that contain a colon somewhere.
  filter(content != "") %>%
  filter(str_detect(content, ":")) %>%
  # `first` = first space-delimited token, `second` = remainder of the line.
  separate(
    content,
    into = c("first", "second"),
    extra = "merge",
    remove = FALSE,
    sep = " "
  ) %>%
  # Keep lines whose first token contains ":" or "Session", then drop those
  # whose first token contains a hyphen.
  filter(str_detect(first, ":|Session")) %>%
  filter(!str_detect(first, "-")) %>%
  # Pull out the 1st and 2nd words of `second` and everything after them;
  # these are only used for the "Session" lines below.
  mutate(
    part_num = word(second, 1, sep = " "),
    part_time = word(second, 2, sep = " ")
  ) %>%
  separate(
    second,
    into = c(NA, NA, "part_content"),
    extra = "merge",
    sep = " ",
    remove = FALSE
  ) %>%
  # For "Session" lines, move the 2nd word of the remainder into `first` and
  # the trailing text into `second`. `second` must be updated before `first`
  # is overwritten because both conditions test the original `first`.
  mutate(
    second = if_else(first == "Session", part_content, second),
    first = if_else(first == "Session", part_time, first)
  ) %>%
  # Drop the helper columns and the raw line.
  select(-contains("part_"), -content) %>%
  print()
# Create the Excel file ----
# Write `x` as a filterable, styled table on a sheet named "output data".
wb <- createWorkbook()
addWorksheet(wb, "output data")
writeDataTable(
  wb,
  sheet = "output data",
  x = x,
  tableStyle = "TableStyleMedium2",
  withFilter = TRUE
)
# Save the workbook as data.xlsx in the working directory, replacing any
# existing file of that name.
saveWorkbook(wb, "data.xlsx", overwrite = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment