Last active
July 19, 2024 07:14
-
-
Save ddotta/8e828145355bb87e78d83191b747b2e0 to your computer and use it in GitHub Desktop.
Script for scraping pdf files with checkboxes (response to https://github.com/ropensci/tabulapdf/issues/165)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the text elements that contain a checked-box mark.
#
# Arguments:
#   text     Character vector to search (the calling script passes the text
#            of one PDF page at a time).
#   pattern  Regular expression identifying a ticked checkbox. Defaults to
#            "✘"; pass e.g. "☒|☑" when the PDF uses other symbols.
#
# Returns the elements of `text` that match `pattern`, or character(0) when
# none of them does.
find_checkboxes <- function(text, pattern = "✘") {
  grep(pattern, text, value = TRUE)
}
# Build the "base de conjoncture" authorization table from the text of a page.
#
# Arguments:
#   text     Character vector holding the page text; lines are separated
#            by "\n".
#   pattern  Regular expression identifying a ticked checkbox
#            (defaults to "✘").
#
# Returns a data.frame with one row per retained line and the columns
#   Base_de_conjoncture (character), Manager (logical), Consultant (logical).
# The three columns are always present, even when no line is retained.
#
# NOTE(review): columns are recovered purely by position after splitting each
# line on runs of two or more whitespace characters, so an unticked cell that
# precedes a ticked one leaves no field and shifts the tick to the left.
# Verify the result against the PDF layout.
create_bdc_tables <- function(text, pattern = "✘") {
  # Split the text into lines; as.character() turns the NULL produced by
  # empty input into character(0) so the steps below still work
  lines <- as.character(unlist(strsplit(text, "\n")))

  # Remove empty lines, then the first 6 remaining lines, which are not
  # needed for authorizations
  lines <- lines[lines != ""]
  lines <- lines[-seq_len(6)]

  # Columns are separated by runs of 2+ whitespace characters
  fields <- strsplit(lines, "\\s{2,}")

  # i-th field of every line; NA when a line has fewer than i fields
  pick_field <- function(i) {
    vapply(fields, function(f) f[i], character(1))
  }

  # fields[1] is skipped: only fields 2 to 4 are used for this table.
  # A field matching `pattern` becomes TRUE; anything else (including a
  # missing field) becomes FALSE.
  data.frame(
    Base_de_conjoncture = pick_field(2),
    Manager = grepl(pattern, pick_field(3)),
    Consultant = grepl(pattern, pick_field(4)),
    stringsAsFactors = FALSE
  )
}
# Build one part of the sources authorization table from the text of a page.
#
# Arguments:
#   text     Character vector holding the page text; lines are separated
#            by "\n".
#   part     "1" or "2". For part "1" the first 3 non-empty lines are
#            dropped; for part "2" the Section of the first row is forced to
#            "060_Productions_meat_eggs".
#   pattern  Regular expression identifying a ticked checkbox
#            (defaults to "✘").
#
# Returns a data.frame with the columns Source_path ("<Section>/<Source>"),
# Producer (logical) and Study_manager (logical). Input without any usable
# line yields a zero-row data.frame with those same columns.
#
# Requires the dplyr and tidyr packages to be installed; they are called
# through their namespace, so they do not need to be attached.
create_sources_tables <- function(text, part, pattern = "✘") {
  # Split the text into lines; as.character() turns the NULL produced by
  # empty input into character(0) so the steps below still work
  lines <- as.character(unlist(strsplit(text, "\n")))

  # Remove empty lines
  lines <- lines[lines != ""]

  # For part 1 of the table, remove the first 3 lines
  if (part == "1") {
    lines <- lines[-seq_len(3)]
  }

  # Columns are separated by runs of 2+ whitespace characters
  fields <- strsplit(lines, "\\s{2,}")

  # i-th field of every line; NA when a line has fewer than i fields
  pick_field <- function(i) {
    vapply(fields, function(f) f[i], character(1))
  }

  # Study_manager is read from the 3rd field and Producer from the 4th
  # (presumably the column order in the PDF; confirm against the document).
  # A field matching `pattern` becomes TRUE, anything else FALSE.
  df <- data.frame(
    Section = pick_field(1),
    Source = pick_field(2),
    Producer = grepl(pattern, pick_field(4)),
    Study_manager = grepl(pattern, pick_field(3)),
    stringsAsFactors = FALSE
  )

  # For part 2, ensure the Section column is filled on the first line.
  # Skipped when there are no rows, so that no row is fabricated.
  if (part == "2" && nrow(df) > 0) {
    df[1, "Section"] <- "060_Productions_meat_eggs"
  }

  # An empty Section means "same section as the line above": turn it into NA
  # and carry the last known value downwards
  df <- dplyr::mutate(df, Section = dplyr::na_if(Section, ""))
  df <- tidyr::fill(df, Section, .direction = "down")

  # Remove lines without a source
  df <- dplyr::filter(df, !is.na(Source))

  # Concatenate the Section and Source columns into a single path
  tidyr::unite(df, "Source_path", Section:Source, sep = "/")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Script for scraping authorization PDF files
library(dplyr) | |
library(tidyr) | |
library(pdftools) | |
source(file = "fonctions.R") | |
# PDF form to scrape (path relative to the working directory)
file <- "E_MONTLOUIS_20240704_SSP_NS_pj_Formulaire_mouvement_SSM_Agriculture_VF.pdf"
if (!file.exists(file)) {
  stop("PDF file not found: ", file, call. = FALSE)
}

# Extract the text of the PDF, one character string per page
pdf_text <- pdftools::pdf_text(file)

# Apply the function to each page of the PDF: a page without any checked box
# yields character(0)
checkboxes <- lapply(pdf_text, find_checkboxes)

# The tables containing the authorizations are on pages 3, 4 and 5
authorization_pages <- c(3, 4, 5)
if (length(checkboxes) < max(authorization_pages)) {
  stop(
    "Expected at least ", max(authorization_pages), " pages in ", file,
    " but found ", length(checkboxes),
    call. = FALSE
  )
}
authorization_checkboxes <- checkboxes[authorization_pages]

# Page 3: "base de conjoncture" table
bdc_table <- create_bdc_tables(authorization_checkboxes[[1]])

# Pages 4 and 5: the sources table is split in two parts, stacked afterwards
sources_table_part1 <- create_sources_tables(authorization_checkboxes[[2]], part = "1")
sources_table_part2 <- create_sources_tables(authorization_checkboxes[[3]], part = "2")
sources_table <- bind_rows(sources_table_part1, sources_table_part2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment