## christlc/extract_from_pdf.R

## extract_from_pdf.R
library(tabulizer)
#### PARAMETERS ####
# Text searched for in each page's extracted text to find the page of interest.
target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"
# PDF file name; used both as the last segment of the download URL and as the
# local file name under pdf/<year>/.
filename <- "head156.pdf"
# Year used to build the download URL and the local sub-directory name.
year <- 2012
####################


# Locate the first page of a PDF whose extracted text matches a pattern.
#
# Args:
#   f: path to the PDF file.
#   target_str: pattern handed to stringr::str_detect(); it is interpreted
#     with stringr's default (regular expression) matching.
#
# Returns:
#   The 1-based number of the first matching page, or NULL when no page
#   matches (including when the document has no pages).
locate_page <- function(f, target_str) {
  # seq_len() gives an empty sequence for a zero-page document, whereas
  # 1:0 would iterate over c(1, 0) and request pages that do not exist.
  for (n in seq_len(tabulizer::get_n_pages(f))) {
    page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
    # isTRUE() skips pages whose match result is NA or not a single value
    # instead of aborting the whole search inside `if`.
    if (isTRUE(stringr::str_detect(page_text, target_str))) {
      return(n)
    }
  }
  NULL
}


# Interactively collect table column regions and derive extraction bounds.
#
# Repeatedly calls locate_areas() for `target_page`; every returned region is
# recorded as one column. The loop stops the first time no region is returned
# for the page.
#
# Args:
#   f: path to the PDF file.
#   target_page: page number passed to locate_areas().
#
# Returns:
#   A list with two elements, each itself wrapped in a one-element list:
#     columns_boundaries - the `right` value of every recorded region, in
#       selection order.
#     area_boundaries - named vector c(top, left, bottom, right) of the
#       bounding box enclosing all recorded regions.
#
# Raises:
#   An error when no region was selected at all.
locate_via_shiny <- function(f, target_page) {
  column_loc_list <- list()
  repeat {
    loc <- locate_areas(f, pages = target_page)
    if (is.null(loc[[1]])) {
      break
    }
    column_loc_list[[length(column_loc_list) + 1]] <- loc[[1]]
  }
  if (length(column_loc_list) == 0) {
    # min()/max() over nothing would silently yield +/-Inf boundaries.
    stop("no area was selected on page ", target_page, call. = FALSE)
  }

  # One row per selection, columns named top/left/bottom/right. Base rbind()
  # replaces the defunct dplyr::rbind_list() and avoids needing dplyr and
  # the magrittr pipe, which the visible part of this script never attaches.
  regions <- do.call(rbind, column_loc_list)
  area_boundaries <- c(
    top = min(regions[, "top"]),
    left = min(regions[, "left"]),
    bottom = max(regions[, "bottom"]),
    right = max(regions[, "right"])
  )
  columns_boundaries <- unname(regions[, "right"])

  list(columns_boundaries = list(columns_boundaries),
       area_boundaries = list(area_boundaries))
}


# Extract one table from the page of a PDF that contains `target_str`.
#
# The page is found with locate_page(), the table area and column boundaries
# are chosen interactively through locate_via_shiny(), and the table is read
# with tabulizer::extract_tables() (guess = FALSE, output = "data.frame").
# Every "." is then removed from all character columns of the result.
#
# Args:
#   f: path to the PDF file.
#   target_str: pattern identifying the page, passed on to locate_page().
#
# Returns:
#   The first extracted table as a data frame, or NULL (with a warning) when
#   the target string is not found or no table is returned for the page.
extract_from_pdf <- function(f, target_str) {
  target_page <- locate_page(f, target_str)
  if (is.null(target_page)) {
    warning("target string not found")
    return(NULL)
  }
  # message() terminates the line; the earlier cat() call left the console
  # cursor on the same line as the status text.
  message("Found keywords on page ", target_page)

  boundaries <- locate_via_shiny(f, target_page)
  tables <- tabulizer::extract_tables(
    f,
    pages = target_page,
    area = boundaries$area_boundaries,
    columns = boundaries$columns_boundaries,
    guess = FALSE,
    output = "data.frame"
  )
  if (length(tables) == 0) {
    # Avoid an opaque "subscript out of bounds" from tables[[1]].
    warning("no table could be extracted from page ", target_page)
    return(NULL)
  }

  tbl <- tables[[1]]
  # Base-R replacement for the dplyr mutate_if()/pipe step, so the function
  # does not rely on packages that are not attached in the visible script.
  is_chr <- vapply(tbl, is.character, logical(1))
  if (any(is_chr)) {
    tbl[is_chr] <- lapply(
      tbl[is_chr],
      function(col) gsub(".", "", col, fixed = TRUE)
    )
  }
  tbl
}


# Download the PDF for one year and extract the target table from it.
#
# Reads the script-level variables `filename` and `target_str`. The file is
# saved as pdf/<year>/<filename> relative to the working directory.
#
# Args:
#   year: year used in both the download URL and the local directory name.
#
# Returns:
#   Whatever extract_from_pdf() returns for the downloaded file.
download_and_extract <- function(year) {
  pdf_dir <- file.path("pdf", year)
  f <- file.path(pdf_dir, filename)
  # recursive = TRUE also creates the parent "pdf" directory; without it the
  # call fails silently on a fresh checkout and the download has nowhere to go.
  dir.create(pdf_dir, showWarnings = FALSE, recursive = TRUE)
  # mode = "wb" keeps the binary PDF intact on Windows.
  download.file(
    paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
    f,
    mode = "wb"
  )
  extract_from_pdf(f, target_str)
}


# Download the PDF for the configured `year` and extract its table.

f <- file.path("pdf", year, filename)
# recursive = TRUE also creates the parent "pdf" directory when it is missing;
# with recursive = FALSE the failure was hidden by showWarnings = FALSE.
dir.create(file.path("pdf", year), showWarnings = FALSE, recursive = TRUE)
# mode = "wb" keeps the binary PDF intact on Windows.
download.file(
  paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
  f,
  mode = "wb"
)

result <- extract_from_pdf(f, target_str)

# Repeat the download-and-extract step for 2008 and 2009.
all_result <- lapply(2008:2009, download_and_extract)

## extract_keywords.R
target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"

# Print the number of every page of `f` whose extracted text matches
# `target_str`.
# seq_len() avoids iterating over c(1, 0) when the document has no pages, and
# isTRUE() skips pages whose match result is NA instead of failing in `if`.
for (n in seq_len(tabulizer::get_n_pages(f))) {
  page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
  if (isTRUE(stringr::str_detect(page_text, target_str))) {
    print(n)
  }
}
	library(tabulizer)
	#### PARAMETERS ####
	# Text searched for in each page's extracted text to find the page of interest.
	target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"
	# PDF file name; used both as the last segment of the download URL and as the
	# local file name under pdf/<year>/.
	filename <- "head156.pdf"
	# Year used to build the download URL and the local sub-directory name.
	year <- 2012
	####################





	# Locate the first page of a PDF whose extracted text matches a pattern.
	#
	# Args:
	#   f: path to the PDF file.
	#   target_str: pattern handed to stringr::str_detect(); it is interpreted
	#     with stringr's default (regular expression) matching.
	#
	# Returns:
	#   The 1-based number of the first matching page, or NULL when no page
	#   matches (including when the document has no pages).
	locate_page <- function(f, target_str) {
	  # seq_len() gives an empty sequence for a zero-page document, whereas
	  # 1:0 would iterate over c(1, 0) and request pages that do not exist.
	  for (n in seq_len(tabulizer::get_n_pages(f))) {
	    page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
	    # isTRUE() skips pages whose match result is NA or not a single value
	    # instead of aborting the whole search inside `if`.
	    if (isTRUE(stringr::str_detect(page_text, target_str))) {
	      return(n)
	    }
	  }
	  NULL
	}


	# Interactively collect table column regions and derive extraction bounds.
	#
	# Repeatedly calls locate_areas() for `target_page`; every returned region is
	# recorded as one column. The loop stops the first time no region is returned
	# for the page.
	#
	# Args:
	#   f: path to the PDF file.
	#   target_page: page number passed to locate_areas().
	#
	# Returns:
	#   A list with two elements, each itself wrapped in a one-element list:
	#     columns_boundaries - the `right` value of every recorded region, in
	#       selection order.
	#     area_boundaries - named vector c(top, left, bottom, right) of the
	#       bounding box enclosing all recorded regions.
	#
	# Raises:
	#   An error when no region was selected at all.
	locate_via_shiny <- function(f, target_page) {
	  column_loc_list <- list()
	  repeat {
	    loc <- locate_areas(f, pages = target_page)
	    if (is.null(loc[[1]])) {
	      break
	    }
	    column_loc_list[[length(column_loc_list) + 1]] <- loc[[1]]
	  }
	  if (length(column_loc_list) == 0) {
	    # min()/max() over nothing would silently yield +/-Inf boundaries.
	    stop("no area was selected on page ", target_page, call. = FALSE)
	  }

	  # One row per selection, columns named top/left/bottom/right. Base rbind()
	  # replaces the defunct dplyr::rbind_list() and avoids needing dplyr and
	  # the magrittr pipe, which the visible part of this script never attaches.
	  regions <- do.call(rbind, column_loc_list)
	  area_boundaries <- c(
	    top = min(regions[, "top"]),
	    left = min(regions[, "left"]),
	    bottom = max(regions[, "bottom"]),
	    right = max(regions[, "right"])
	  )
	  columns_boundaries <- unname(regions[, "right"])

	  list(columns_boundaries = list(columns_boundaries),
	       area_boundaries = list(area_boundaries))
	}




	# Extract one table from the page of a PDF that contains `target_str`.
	#
	# The page is found with locate_page(), the table area and column boundaries
	# are chosen interactively through locate_via_shiny(), and the table is read
	# with tabulizer::extract_tables() (guess = FALSE, output = "data.frame").
	# Every "." is then removed from all character columns of the result.
	#
	# Args:
	#   f: path to the PDF file.
	#   target_str: pattern identifying the page, passed on to locate_page().
	#
	# Returns:
	#   The first extracted table as a data frame, or NULL (with a warning) when
	#   the target string is not found or no table is returned for the page.
	extract_from_pdf <- function(f, target_str) {
	  target_page <- locate_page(f, target_str)
	  if (is.null(target_page)) {
	    warning("target string not found")
	    return(NULL)
	  }
	  # message() terminates the line; the earlier cat() call left the console
	  # cursor on the same line as the status text.
	  message("Found keywords on page ", target_page)

	  boundaries <- locate_via_shiny(f, target_page)
	  tables <- tabulizer::extract_tables(
	    f,
	    pages = target_page,
	    area = boundaries$area_boundaries,
	    columns = boundaries$columns_boundaries,
	    guess = FALSE,
	    output = "data.frame"
	  )
	  if (length(tables) == 0) {
	    # Avoid an opaque "subscript out of bounds" from tables[[1]].
	    warning("no table could be extracted from page ", target_page)
	    return(NULL)
	  }

	  tbl <- tables[[1]]
	  # Base-R replacement for the dplyr mutate_if()/pipe step, so the function
	  # does not rely on packages that are not attached in the visible script.
	  is_chr <- vapply(tbl, is.character, logical(1))
	  if (any(is_chr)) {
	    tbl[is_chr] <- lapply(
	      tbl[is_chr],
	      function(col) gsub(".", "", col, fixed = TRUE)
	    )
	  }
	  tbl
	}


	# Download the PDF for one year and extract the target table from it.
	#
	# Reads the script-level variables `filename` and `target_str`. The file is
	# saved as pdf/<year>/<filename> relative to the working directory.
	#
	# Args:
	#   year: year used in both the download URL and the local directory name.
	#
	# Returns:
	#   Whatever extract_from_pdf() returns for the downloaded file.
	download_and_extract <- function(year) {
	  pdf_dir <- file.path("pdf", year)
	  f <- file.path(pdf_dir, filename)
	  # recursive = TRUE also creates the parent "pdf" directory; without it the
	  # call fails silently on a fresh checkout and the download has nowhere to go.
	  dir.create(pdf_dir, showWarnings = FALSE, recursive = TRUE)
	  # mode = "wb" keeps the binary PDF intact on Windows.
	  download.file(
	    paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
	    f,
	    mode = "wb"
	  )
	  extract_from_pdf(f, target_str)
	}



	# Download the PDF for the configured `year` and extract its table.

	f <- file.path("pdf", year, filename)
	# recursive = TRUE also creates the parent "pdf" directory when it is missing;
	# with recursive = FALSE the failure was hidden by showWarnings = FALSE.
	dir.create(file.path("pdf", year), showWarnings = FALSE, recursive = TRUE)
	# mode = "wb" keeps the binary PDF intact on Windows.
	download.file(
	  paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
	  f,
	  mode = "wb"
	)

	result <- extract_from_pdf(f, target_str)

	# Repeat the download-and-extract step for 2008 and 2009.
	all_result <- lapply(2008:2009, download_and_extract)