Skip to content

Instantly share code, notes, and snippets.

@christlc
Last active April 17, 2018 13:13
Show Gist options
  • Save christlc/c6cd2a2f3ff99b69ae2884abffd1513a to your computer and use it in GitHub Desktop.
Save christlc/c6cd2a2f3ff99b69ae2884abffd1513a to your computer and use it in GitHub Desktop.
Hong Kong Government budget: table extraction from PDF - a semi-automatic solution
library(tabulizer)
#### PARAMETERS ####
# Phrase identifying the page that holds the table to extract; it is matched
# against the text of each page by locate_page().
target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"
# Name of the PDF; used both as the last component of the download URL and as
# the local file name under pdf/<year>/.
filename <- "head156.pdf"
# Budget year; selects the URL path segment and the local sub-directory.
year <- 2012
####################
# Locate the first page of a PDF whose text matches a key phrase.
#
# Arguments:
#   f           Path to the PDF file.
#   target_str  Pattern to look for. It is handed to stringr::str_detect()
#               unchanged, so it is interpreted as a regular expression.
#
# Returns the number of the first matching page, or NULL when no page matches
# (which includes a document that reports zero pages).
locate_page <- function(f, target_str) {
  # seq_len() gives an empty sequence for a zero-page document, whereas
  # 1:0 would iterate over c(1, 0) and request pages that do not exist.
  for (n in seq_len(tabulizer::get_n_pages(f))) {
    page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
    # isTRUE(any(...)) treats an NA or zero-length detection result as
    # "no match" instead of letting if() fail on it.
    if (isTRUE(any(stringr::str_detect(page_text, target_str)))) {
      # Stop at the first hit; later pages are never read.
      return(n)
    }
  }
  NULL
}
# Interactively collect the table's column regions via tabulizer's area
# selection gadget.
#
# tabulizer::locate_areas() is invoked repeatedly for the same page; each
# selection is recorded as one region. The loop ends on the first round that
# yields no selection.
#
# Arguments:
#   f            Path to the PDF file.
#   target_page  Page number to display in the gadget.
#
# Returns a list with two elements, each wrapped in a length-one list:
#   columns_boundaries  the `right` coordinate of every selected region, in
#                       selection order
#   area_boundaries     c(top, left, bottom, right) of the bounding box that
#                       encloses all selected regions
# Signals an error when no region was selected at all.
locate_via_shiny <- function(f, target_page) {
  column_loc_list <- list()
  repeat {
    loc <- tabulizer::locate_areas(f, pages = target_page)
    if (length(loc) == 0L || is.null(loc[[1]])) {
      break
    }
    column_loc_list[[length(column_loc_list) + 1L]] <- loc[[1]]
  }

  if (length(column_loc_list) == 0L) {
    stop("no area was selected on page ", target_page, call. = FALSE)
  }

  # One row per selection. Base R is used on purpose: the script attaches only
  # tabulizer, so the dplyr verbs and the pipe are not available here.
  # NOTE(review): assumes every selection is a vector named top/left/bottom/
  # right, as the column references in the original code imply -- confirm
  # against the installed tabulizer version.
  regions <- do.call(rbind, lapply(column_loc_list, unlist))

  area_boundaries <- c(
    top = min(regions[, "top"]),
    left = min(regions[, "left"]),
    bottom = max(regions[, "bottom"]),
    right = max(regions[, "right"])
  )
  columns_boundaries <- unname(regions[, "right"])

  list(
    columns_boundaries = list(columns_boundaries),
    area_boundaries = list(area_boundaries)
  )
}
# Extract the table that sits on the page containing `target_str`.
#
# The page is found with locate_page(), the table area and column boundaries
# are chosen interactively with locate_via_shiny(), and the table is then read
# with tabulizer::extract_tables() using those boundaries (guess = FALSE).
#
# Arguments:
#   f           Path to the PDF file.
#   target_str  Pattern identifying the page; see locate_page().
#
# Returns the first extracted table as a data frame with every "." removed
# from its character columns, or NULL (with a warning) when the page cannot be
# found or no table is extracted.
extract_from_pdf <- function(f, target_str) {
  target_page <- locate_page(f, target_str)
  if (is.null(target_page)) {
    warning("target string not found")
    return(NULL)
  }
  message("Found keywords on page ", target_page)

  boundaries <- locate_via_shiny(f, target_page)
  tables <- tabulizer::extract_tables(
    f,
    pages = target_page,
    area = boundaries$area_boundaries,
    columns = boundaries$columns_boundaries,
    guess = FALSE,
    output = "data.frame"
  )
  if (length(tables) == 0L) {
    warning("no table could be extracted from page ", target_page)
    return(NULL)
  }

  tbl <- tables[[1]]
  # Strip periods from text columns (presumably dot leaders in the printed
  # table -- verify against the PDF). Done with base R so the function does
  # not depend on dplyr, which this script never attaches.
  is_text <- vapply(tbl, is.character, logical(1))
  if (any(is_text)) {
    tbl[is_text] <- lapply(
      tbl[is_text],
      function(col) stringr::str_replace_all(col, "\\.", "")
    )
  }
  tbl
}
# Download one year's budget PDF and extract the target table from it.
#
# Arguments:
#   year  Budget year; used as the sub-directory under "pdf" and as a path
#         segment of the download URL.
#
# Reads the script-level variables `filename` and `target_str`.
# Side effects: creates pdf/<year>/ if needed and writes the downloaded file
# there.
#
# Returns whatever extract_from_pdf() returns for the downloaded file.
download_and_extract <- function(year) {
  f <- file.path("pdf", year, filename)
  # recursive = TRUE so the parent "pdf" directory is created as well; with
  # recursive = FALSE the call fails silently when "pdf" does not exist yet.
  dir.create(dirname(f), showWarnings = FALSE, recursive = TRUE)
  # mode = "wb": a PDF is binary and a text-mode transfer can corrupt it on
  # Windows.
  download.file(
    paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
    f,
    mode = "wb"
  )
  extract_from_pdf(f, target_str)
}
# Download the PDF for the configured `year` and extract its table.
# `f` is kept as a script-level variable because the diagnostic loop at the
# end of this section reads it.
f <- file.path("pdf", year, filename)
# recursive = TRUE so the parent "pdf" directory is created as well.
dir.create(dirname(f), showWarnings = FALSE, recursive = TRUE)
# mode = "wb": a PDF is binary and a text-mode transfer can corrupt it on
# Windows.
download.file(
  paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
  f,
  mode = "wb"
)
result <- extract_from_pdf(f, target_str)

# Same extraction for the 2008 and 2009 budgets; one list element per year.
all_result <- lapply(2008:2009, download_and_extract)

# Diagnostic: print the number of every page of `f` that matches `target_str`
# (locate_page() stops at the first one).
for (n in seq_len(tabulizer::get_n_pages(f))) {
  page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
  if (isTRUE(any(stringr::str_detect(page_text, target_str)))) {
    print(n)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment