# phrmendes/get_from_pdf.R

## get_from_pdf.R
# packages ----

# Packages this script depends on.
packages <- c(
  "curl", "glue", "pdftools", "stringr", "tibble", "purrr", "tesseract"
)

# Install only the packages that are not available yet, so re-running the
# script does not reinstall everything each time.
is_installed <- vapply(
  packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)

if (!all(is_installed)) {
  install.packages(packages[!is_installed])
}

# library() stops with an error when a package cannot be loaded, whereas
# require() would only return FALSE and let the script fail later.
invisible(lapply(packages, library, character.only = TRUE))

# function ----

#' Extract the first CEP and CNPJ found in a remote PDF
#'
#' Downloads the PDF at `x` to a temporary file, runs OCR over it with
#' `pdftools::pdf_ocr_text()` (600 dpi, language "por"), joins the per-page
#' text into a single string and extracts the first match of each pattern.
#' The temporary PDF is removed when the function exits.
#'
#' NOTE(review): OCR with `language = "por"` presumably needs the Portuguese
#' tesseract training data to be installed -- confirm on the target machine.
#'
#' @param x URL of the PDF, passed to `curl::curl_download()`.
#' @param pages Page numbers to OCR, forwarded to `pdftools::pdf_ocr_text()`.
#'   The default, `NULL`, leaves the page selection to pdftools (all pages).
#' @return A one-row tibble with character columns `cep` and `cnpj`; a column
#'   holds `NA` when its pattern does not occur in the OCR text.
get_from_pdf <- function(x, pages = NULL) {
  # CNPJ layout: 00.000.000/0000-00. The dots are escaped so they match a
  # literal "." only, not any character.
  regex_cnpj <- "[0-9]{2}\\.[0-9]{3}\\.[0-9]{3}/[0-9]{4}-[0-9]{2}"

  # CEP layout: 00000-000.
  regex_cep <- "[0-9]{5}-[0-9]{3}"

  pdf_path <- tempfile(fileext = ".pdf")
  # Registered before the download so a partially written file is removed too.
  on.exit(unlink(pdf_path), add = TRUE)

  curl::curl_download(x, pdf_path)

  txt <- pdftools::pdf_ocr_text(
    pdf_path,
    pages = pages,
    dpi = 600,
    language = "por"
  ) |>
    stringr::str_flatten()

  # str_extract() treats a plain string pattern as a regular expression, so
  # no regex() wrapper (and no attached stringr) is needed.
  tibble::tibble(
    cep = stringr::str_extract(txt, pattern = regex_cep),
    cnpj = stringr::str_extract(txt, pattern = regex_cnpj)
  )
}

# request ----

# URLs of the PDFs to process.
urls <- c("url.com/x.pdf", "url.com/y.pdf")

# Run get_from_pdf() on every URL and row-bind the results into one data
# frame.
df <- purrr::map_dfr(urls, get_from_pdf)
	# packages ----

	# Packages this script depends on.
	packages <- c(
	  "curl", "glue", "pdftools", "stringr", "tibble", "purrr", "tesseract"
	)

	# Install only the packages that are not available yet, so re-running the
	# script does not reinstall everything each time.
	is_installed <- vapply(
	  packages,
	  requireNamespace,
	  logical(1),
	  quietly = TRUE
	)

	if (!all(is_installed)) {
	  install.packages(packages[!is_installed])
	}

	# library() stops with an error when a package cannot be loaded, whereas
	# require() would only return FALSE and let the script fail later.
	invisible(lapply(packages, library, character.only = TRUE))

	# function ----

	#' Extract the first CEP and CNPJ found in a remote PDF
	#'
	#' Downloads the PDF at `x` to a temporary file, runs OCR over it with
	#' `pdftools::pdf_ocr_text()` (600 dpi, language "por"), joins the per-page
	#' text into a single string and extracts the first match of each pattern.
	#' The temporary PDF is removed when the function exits.
	#'
	#' NOTE(review): OCR with `language = "por"` presumably needs the Portuguese
	#' tesseract training data to be installed -- confirm on the target machine.
	#'
	#' @param x URL of the PDF, passed to `curl::curl_download()`.
	#' @param pages Page numbers to OCR, forwarded to `pdftools::pdf_ocr_text()`.
	#'   The default, `NULL`, leaves the page selection to pdftools (all pages).
	#' @return A one-row tibble with character columns `cep` and `cnpj`; a column
	#'   holds `NA` when its pattern does not occur in the OCR text.
	get_from_pdf <- function(x, pages = NULL) {
	  # CNPJ layout: 00.000.000/0000-00. The dots are escaped so they match a
	  # literal "." only, not any character.
	  regex_cnpj <- "[0-9]{2}\\.[0-9]{3}\\.[0-9]{3}/[0-9]{4}-[0-9]{2}"

	  # CEP layout: 00000-000.
	  regex_cep <- "[0-9]{5}-[0-9]{3}"

	  pdf_path <- tempfile(fileext = ".pdf")
	  # Registered before the download so a partially written file is removed too.
	  on.exit(unlink(pdf_path), add = TRUE)

	  curl::curl_download(x, pdf_path)

	  txt <- pdftools::pdf_ocr_text(
	    pdf_path,
	    pages = pages,
	    dpi = 600,
	    language = "por"
	  ) |>
	    stringr::str_flatten()

	  # str_extract() treats a plain string pattern as a regular expression, so
	  # no regex() wrapper (and no attached stringr) is needed.
	  tibble::tibble(
	    cep = stringr::str_extract(txt, pattern = regex_cep),
	    cnpj = stringr::str_extract(txt, pattern = regex_cnpj)
	  )
	}

	# request ----

	# URLs of the PDFs to process.
	urls <- c("url.com/x.pdf", "url.com/y.pdf")

	# Run get_from_pdf() on every URL and row-bind the results into one data
	# frame.
	df <- purrr::map_dfr(urls, get_from_pdf)