# ddotta/fonctions.R

## fonctions.R
# Function to identify checkbox patterns
find_checkboxes <- function(text) {
  # Example of pattern: "☒" or "☑" or any other symbol used in the PDF
  checkboxes <- grep("✘", text, value = TRUE)
  return(checkboxes)
}

create_bdc_tables <- function(text) {

  # Split the text into lines
  lines <- unlist(strsplit(text, "\n"))

  # Remove empty lines and lines not needed for authorizations
  lines <- lines[lines != ""][-c(1:6)]

  # Initialize lists to store the data
  base_de_conjoncture <- c()
  manager <- c()
  consultant <- c()

  # Loop through each line to extract information
  for (line in lines) {

    # Use spaces to align columns
    fields <- unlist(strsplit(line, "\\s{2,}"))

    base_de_conjoncture <- c(base_de_conjoncture, fields[2])
    manager <- c(manager, fields[3])
    consultant <- c(consultant, fields[4])

  }

  # Create the data frame
  df <- data.frame(
    Base_de_conjoncture = base_de_conjoncture,
    Manager = manager,
    Consultant = consultant
  )

  # Replace ✘ with TRUE (FALSE otherwise)
  df$Manager <- grepl("✘", df$Manager)
  df$Consultant <- grepl("✘", df$Consultant)

  return(df)
}

create_sources_tables <- function(text, part) {

  # Split the text into lines
  lines <- unlist(strsplit(text, "\n"))

  # Remove empty lines and lines not needed for authorizations
  lines <- lines[lines != ""]

  # For part 1 of the table, remove the first 3 lines
  if (part == "1") {
    lines <- lines[-c(1:3)]
  }

  # Initialize lists to store the data
  section <- c()
  source <- c()
  producer <- c()
  study_manager <- c()

  # Loop through each line to extract information
  for (line in lines) {

    # Use spaces to align columns
    fields <- unlist(strsplit(line, "\\s{2,}"))

    section <- c(section, fields[1])
    source <- c(source, fields[2])
    producer <- c(producer, fields[4])
    study_manager <- c(study_manager, fields[3])

  }

  # Create the data frame
  df <- data.frame(
    Section = section,
    Source = source,
    Producer = producer,
    Study_manager = study_manager
  )

  # Replace ✘ with TRUE (FALSE otherwise)
  df$Producer <- grepl("✘", df$Producer)
  df$Study_manager <- grepl("✘", df$Study_manager)

  # For part 2, ensure the Section column is filled on the first line
  if (part == "2") {
    df[1,"Section"] <- "060_Productions_meat_eggs"
  }

  df <- df %>%
    # Replace empty strings with NA
    mutate(Section = na_if(Section, "")) %>%
    # Use fill to complete missing values
    fill(Section, .direction = "down") %>%
    # Remove unnecessary lines
    filter(!is.na(Source)) %>%
    # Concatenate Section and Source columns
    unite("Source_path", Section:Source, sep = "/")

  return(df)
}

## scraping_pdf_tables
### Script for scraping authorization pdf files

library(dplyr)
library(tidyr)
library(pdftools)

# Load the helpers used below (find_checkboxes(), create_bdc_tables() and
# create_sources_tables() are expected to be defined in fonctions.R)
source("fonctions.R")

# Authorization form to scrape; its text is extracted page by page
file <- "E_MONTLOUIS_20240704_SSP_NS_pj_Formulaire_mouvement_SSM_Agriculture_VF.pdf"
pdf_text <- pdftools::pdf_text(pdf = file)

# Run the checkbox search on every page of the PDF
checkboxes <- lapply(X = pdf_text, FUN = find_checkboxes)

# Pages 3, 4 and 5 hold the authorization tables
authorization_checkboxes <- checkboxes[3:5]

# First of these pages: the "base de conjoncture" table
bdc_table <- create_bdc_tables(text = authorization_checkboxes[[1]])

# The sources table spans the next two pages; parse each half, then stack them
sources_table_part1 <- create_sources_tables(
  text = authorization_checkboxes[[2]],
  part = "1"
)
sources_table_part2 <- create_sources_tables(
  text = authorization_checkboxes[[3]],
  part = "2"
)
sources_table <- dplyr::bind_rows(sources_table_part1, sources_table_part2)
	#' Keep the strings that contain a ticked checkbox
	#'
	#' The tick mark searched for is the literal "✘" character.
	#'
	#' @param text Character vector to search (for example the text of one page).
	#' @return The elements of `text` that contain at least one "✘"; a zero-length
	#'   character vector when none does.
	find_checkboxes <- function(text) {
	  grep(pattern = "✘", x = text, value = TRUE)
	}

	#' Parse the "base de conjoncture" authorization table of one PDF page
	#'
	#' The text is split into lines, blank lines are dropped and the first six
	#' remaining lines are skipped. Every other line is cut into fields on runs of
	#' two or more whitespace characters.
	#'
	#' @param text Character vector holding the page text, lines separated by
	#'   newline characters.
	#' @return A data.frame with one row per retained line and the columns
	#'   `Base_de_conjoncture` (second field of the line), `Manager` and
	#'   `Consultant` (`TRUE` when the third, respectively fourth, field contains
	#'   "✘", `FALSE` otherwise). Input without any usable line yields a zero-row
	#'   data.frame with the same three columns.
	create_bdc_tables <- function(text) {
	  # as.character() turns the NULL that unlist() returns for empty input into
	  # character(0), so the rest of the function needs no special case.
	  lines <- as.character(unlist(strsplit(text, "\n")))
	  lines <- lines[lines != ""]

	  # Skip the six lines that precede the table rows.
	  lines <- lines[-seq_len(6L)]

	  # Columns are separated by runs of at least two whitespace characters.
	  # NOTE(review): this split discards column positions, so a row with a single
	  # "✘" always reports it under Manager, even if the mark sat in the
	  # Consultant column of the PDF. Confirm against the document layout.
	  fields <- strsplit(lines, "\\s{2,}")

	  # i-th field of every line; NA when the line has fewer than i fields.
	  pick <- function(i) {
	    vapply(fields, function(f) f[i], character(1), USE.NAMES = FALSE)
	  }

	  data.frame(
	    Base_de_conjoncture = pick(2L),
	    # grepl() returns FALSE for NA, i.e. for lines without that field.
	    Manager = grepl("✘", pick(3L)),
	    Consultant = grepl("✘", pick(4L))
	  )
	}

	#' Parse one page of the sources authorization table
	#'
	#' The text is split into lines and blank lines are dropped. Every remaining
	#' line is cut into fields on runs of two or more whitespace characters:
	#' field 1 is the section, field 2 the source, field 3 the study-manager mark
	#' and field 4 the producer mark. Lines with an empty section inherit the
	#' section of the closest preceding line that has one, and lines without a
	#' source are dropped.
	#'
	#' Only base R is used, so the function does not rely on dplyr or tidyr being
	#' attached by the caller.
	#'
	#' @param text Character vector holding the page text, lines separated by
	#'   newline characters.
	#' @param part `"1"` for the first page of the table (its first three
	#'   non-empty lines are skipped), `"2"` for the continuation page (the section
	#'   of its first line is forced to "060_Productions_meat_eggs").
	#' @return A data.frame with the columns `Source_path` (section and source
	#'   joined by "/"; a section that is still missing appears as "NA"),
	#'   `Producer` and `Study_manager` (`TRUE` when the corresponding field
	#'   contains "✘"). Input without any usable line yields a zero-row data.frame
	#'   with the same columns.
	create_sources_tables <- function(text, part) {
	  # as.character() turns the NULL that unlist() returns for empty input into
	  # character(0), so the rest of the function needs no special case.
	  lines <- as.character(unlist(strsplit(text, "\n")))
	  lines <- lines[lines != ""]

	  # Only the first page of the table starts with three lines to skip.
	  if (part == "1") {
	    lines <- lines[-seq_len(3L)]
	  }

	  # Columns are separated by runs of at least two whitespace characters.
	  # NOTE(review): this split discards column positions, so a row with a single
	  # "✘" always reports it under Study_manager. Confirm against the PDF layout.
	  fields <- strsplit(lines, "\\s{2,}")

	  # i-th field of every line; NA when the line has fewer than i fields.
	  pick <- function(i) {
	    vapply(fields, function(f) f[i], character(1), USE.NAMES = FALSE)
	  }

	  section <- pick(1L)
	  src <- pick(2L)
	  # grepl() returns FALSE for NA, i.e. for lines without that field.
	  study_manager <- grepl("✘", pick(3L))
	  producer <- grepl("✘", pick(4L))

	  # The continuation page does not repeat the current section on its first
	  # line. Nothing is injected when the page has no line at all.
	  if (part == "2" && length(section) > 0L) {
	    section[1L] <- "060_Productions_meat_eggs"
	  }

	  # Carry the last known section downwards over empty ones; leading gaps
	  # stay NA.
	  section[section %in% ""] <- NA_character_
	  has_section <- !is.na(section)
	  section <- c(NA_character_, section[has_section])[cumsum(has_section) + 1L]

	  # Lines without a source only carry a section title.
	  keep <- !is.na(src)

	  data.frame(
	    Source_path = paste(section[keep], src[keep], sep = "/"),
	    Producer = producer[keep],
	    Study_manager = study_manager[keep],
	    stringsAsFactors = FALSE
	  )
	}
	### Script for scraping authorization pdf files

	library(dplyr)
	library(tidyr)
	library(pdftools)

	# Load the helpers used below (find_checkboxes(), create_bdc_tables() and
	# create_sources_tables() are expected to be defined in fonctions.R)
	source("fonctions.R")

	# Authorization form to scrape; its text is extracted page by page
	file <- "E_MONTLOUIS_20240704_SSP_NS_pj_Formulaire_mouvement_SSM_Agriculture_VF.pdf"
	pdf_text <- pdftools::pdf_text(pdf = file)

	# Run the checkbox search on every page of the PDF
	checkboxes <- lapply(X = pdf_text, FUN = find_checkboxes)

	# Pages 3, 4 and 5 hold the authorization tables
	authorization_checkboxes <- checkboxes[3:5]

	# First of these pages: the "base de conjoncture" table
	bdc_table <- create_bdc_tables(text = authorization_checkboxes[[1]])

	# The sources table spans the next two pages; parse each half, then stack them
	sources_table_part1 <- create_sources_tables(
	  text = authorization_checkboxes[[2]],
	  part = "1"
	)
	sources_table_part2 <- create_sources_tables(
	  text = authorization_checkboxes[[3]],
	  part = "2"
	)
	sources_table <- dplyr::bind_rows(sources_table_part1, sources_table_part2)