Skip to content

Instantly share code, notes, and snippets.

@ddotta
Last active July 19, 2024 07:14
Show Gist options
  • Save ddotta/8e828145355bb87e78d83191b747b2e0 to your computer and use it in GitHub Desktop.
Save ddotta/8e828145355bb87e78d83191b747b2e0 to your computer and use it in GitHub Desktop.
Script for scraping pdf files with checkboxes (response to https://github.com/ropensci/tabulapdf/issues/165)
#' Keep the text elements that contain a ticked-checkbox mark
#'
#' @param text Character vector to search (the driver script passes one
#'   string per call, via `lapply()` over the extracted PDF text).
#' @param pattern Literal symbol that marks a ticked checkbox. Defaults to
#'   "✘"; pass another symbol (e.g. "☒" or "☑") for PDFs that use one.
#' @return The elements of `text` that contain `pattern`, or `character(0)`
#'   when none of them does.
find_checkboxes <- function(text, pattern = "✘") {
  # fixed = TRUE: the mark is a literal symbol, not a regular expression, so
  # a caller-supplied mark such as "." or "*" is matched verbatim.
  grep(pattern, text, value = TRUE, fixed = TRUE)
}
#' Build the "base de conjoncture" authorization table from page text
#'
#' The text is split into lines, empty lines and the leading header lines are
#' dropped, and every remaining line is cut into fields wherever two or more
#' whitespace characters occur. Field 2 is kept as the label; fields 3 and 4
#' are turned into logical flags according to whether they contain "✘".
#'
#' @param text Character vector holding the page text; lines are separated
#'   by "\n".
#' @param n_header Number of non-empty leading lines to discard before the
#'   table rows start. Defaults to 6, the value this function always used.
#' @return A data frame with columns `Base_de_conjoncture` (character),
#'   `Manager` (logical) and `Consultant` (logical), one row per remaining
#'   line. Empty input gives a 0-row data frame with the same three columns.
create_bdc_tables <- function(text, n_header = 6L) {
  # as.character() keeps the empty case a character vector: unlist() returns
  # NULL there, which strsplit() below would reject.
  lines <- as.character(unlist(strsplit(text, "\n"), use.names = FALSE))
  lines <- lines[lines != ""]
  # Positional filter rather than lines[-(1:n)]: still correct when
  # n_header is 0 (a negative empty index would drop every line).
  lines <- lines[seq_along(lines) > n_header]

  # Columns are recovered from runs of 2+ whitespace characters.
  fields <- strsplit(lines, "\\s{2,}")
  # i-th field of every line; NA when a line has fewer than i fields.
  nth_field <- function(i) {
    vapply(fields, function(f) f[i], character(1), USE.NAMES = FALSE)
  }

  # grepl() returns FALSE for NA, so a missing field reads as "not ticked".
  data.frame(
    Base_de_conjoncture = nth_field(2L),
    Manager = grepl("✘", nth_field(3L)),
    Consultant = grepl("✘", nth_field(4L)),
    stringsAsFactors = FALSE
  )
}
#' Build one part of the "sources" authorization table from page text
#'
#' The text is split into lines and empty lines are dropped; for part "1" the
#' leading header lines are dropped as well. Every remaining line is cut into
#' fields wherever two or more whitespace characters occur: field 1 is the
#' section, field 2 the source, field 3 the study-manager mark and field 4
#' the producer mark. Marks become logical flags (TRUE when the field
#' contains "✘"). Empty sections are filled downwards from the last
#' non-empty one, rows without a source are dropped, and section and source
#' are joined into a single `Source_path` column ("section/source").
#'
#' Relies on dplyr and tidyr being attached by the caller (`%>%`, `mutate`,
#' `na_if`, `fill`, `filter`, `unite`).
#'
#' @param text Character vector holding the page text; lines are separated
#'   by "\n".
#' @param part "1" or "2": which part of the table `text` holds.
#' @param n_header Number of non-empty leading lines to discard for part "1".
#'   Defaults to 3, the value this function always used.
#' @param first_section Section label forced onto the first row of part "2",
#'   so that the downward fill has a starting value. Defaults to the label
#'   this function always used.
#' @return A data frame with columns `Source_path` (character), `Producer`
#'   (logical) and `Study_manager` (logical). Empty input gives a 0-row
#'   data frame with these columns.
create_sources_tables <- function(text, part, n_header = 3L,
                                  first_section = "060_Productions_meat_eggs") {
  # as.character() keeps the empty case a character vector: unlist() returns
  # NULL there, which strsplit() below would reject.
  lines <- as.character(unlist(strsplit(text, "\n"), use.names = FALSE))
  lines <- lines[lines != ""]
  if (part == "1") {
    # Positional filter rather than lines[-(1:n)]: still correct when
    # n_header is 0 (a negative empty index would drop every line).
    lines <- lines[seq_along(lines) > n_header]
  }

  # Columns are recovered from runs of 2+ whitespace characters.
  fields <- strsplit(lines, "\\s{2,}")
  # i-th field of every line; NA when a line has fewer than i fields.
  nth_field <- function(i) {
    vapply(fields, function(f) f[i], character(1), USE.NAMES = FALSE)
  }

  # Producer is read from field 4 and Study_manager from field 3, as before.
  # grepl() returns FALSE for NA, so a missing field reads as "not ticked".
  df <- data.frame(
    Section = nth_field(1L),
    Source = nth_field(2L),
    Producer = grepl("✘", nth_field(4L)),
    Study_manager = grepl("✘", nth_field(3L)),
    stringsAsFactors = FALSE
  )

  # Part 2 needs a section on its first row for the fill below; skip this on
  # an empty table, where the assignment would create a spurious row.
  if (part == "2" && nrow(df) > 0) {
    df[1, "Section"] <- first_section
  }

  df %>%
    # Replace empty strings with NA so that fill() treats them as missing
    mutate(Section = na_if(Section, "")) %>%
    # Carry the last known section down over the missing ones
    fill(Section, .direction = "down") %>%
    # Drop lines that carry no source
    filter(!is.na(Source)) %>%
    # Concatenate Section and Source columns
    unite("Source_path", Section:Source, sep = "/")
}
### Script for scraping authorization pdf files
# Reads the authorization form PDF, keeps the page texts that contain a
# ticked checkbox, and converts three of them into data frames:
# `bdc_table` and `sources_table`.

# dplyr / tidyr are attached here because create_sources_tables() uses their
# verbs and the %>% pipe unqualified; pdftools extracts the PDF text.
library(dplyr)
library(tidyr)
library(pdftools)
# NOTE(review): presumably fonctions.R defines find_checkboxes(),
# create_bdc_tables() and create_sources_tables() - confirm.
source(file = "fonctions.R")
# Path of the PDF form to scrape, relative to the working directory
file <- "E_MONTLOUIS_20240704_SSP_NS_pj_Formulaire_mouvement_SSM_Agriculture_VF.pdf"
pdf_text <- pdftools::pdf_text(file)
# Apply the function to each page of the PDF
checkboxes <- lapply(pdf_text, find_checkboxes)
# The tables containing the authorizations are checkboxes[[3]], checkboxes[[4]], and checkboxes[[5]]
# (positions are hard-coded for this particular form)
authorization_checkboxes <- checkboxes[c(3, 4, 5)]
# First selected page: the "base de conjoncture" table
bdc_table <- create_bdc_tables(authorization_checkboxes[[1]])
# The sources table spans the next two selected pages and is parsed in two parts
sources_table_part1 <- create_sources_tables(authorization_checkboxes[[2]], part = "1")
sources_table_part2 <- create_sources_tables(authorization_checkboxes[[3]], part = "2")
# Stack both parts into a single sources table
sources_table <- bind_rows(sources_table_part1, sources_table_part2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment