Skip to content

Instantly share code, notes, and snippets.

@jtrecenti
Created December 1, 2023 23:04
Show Gist options
  • Save jtrecenti/c8ca0b3096104905bdf361f35c78966e to your computer and use it in GitHub Desktop.
Save jtrecenti/c8ca0b3096104905bdf361f35c78966e to your computer and use it in GitHub Desktop.
# Extract the text of the source PDF; `pdf_full` is indexed by page number
# further down when selecting the pages to process.
pdf_full <- pdftools::pdf_text(pdf = "pdf_full.pdf")
# Pages of the PDF to process, given as inclusive ranges:
# 33-36, 38-39, 41-46, 48-49, 52-54, 56-61, 63-72, 74-77, 79-83, 85-91,
# 93-96, 98-100, 102-105
paginas <- unlist(Map(
  seq.int,
  # first page of each range
  c(33L, 38L, 41L, 48L, 52L, 56L, 63L, 74L, 79L, 85L, 93L, 98L, 102L),
  # last page of each range
  c(36L, 39L, 46L, 49L, 54L, 61L, 72L, 77L, 83L, 91L, 96L, 100L, 105L)
))
# Extract structured records from the text of one PDF page using the OpenAI
# chat completion API, caching the raw JSON answer on disk.
#
# Arguments:
#   txt_pag  Text of the page, sent as the "user" message.
#   pag      Page identifier; used in the progress message and as the cache
#            file name (saidas_paginas/{pag}.json).
#
# Returns a tibble built from the JSON answer. If the cache file for `pag`
# already exists it is read instead of calling the API.
#
# Errors from the API call or from JSON parsing propagate to the caller.
pegar_infos_pagina <- function(txt_pag, pag) {
  usethis::ui_info("Pegando pagina {pag}...")
  f <- glue::glue("saidas_paginas/{pag}.json")

  if (fs::file_exists(f)) {
    # Cache hit: reuse the previously stored answer.
    dados <- jsonlite::fromJSON(readr::read_file(f))
  } else {
    result <- openai::create_chat_completion(
      model = "gpt-4-1106-preview",
      messages = list(
        list(
          role = "system",
          # The extraction instructions live in prompt.md.
          content = readr::read_file("prompt.md")
        ),
        list(
          role = "user",
          content = txt_pag
        )
      ),
      temperature = 0
    )
    # Strip the markdown code fence the model may wrap around the JSON.
    txt_resp <- result$choices[["message.content"]] |>
      stringr::str_remove_all("```json\n|\n```")

    # Parse BEFORE caching: an answer that is not valid JSON must not be
    # stored, otherwise every later run would re-read the broken file and
    # fail again without ever re-querying the API.
    dados <- jsonlite::fromJSON(txt_resp)

    # Make sure the cache directory exists (no-op when it already does);
    # without it the write below fails after the API call was already made.
    fs::dir_create(dirname(f))
    readr::write_file(txt_resp, f)
  }

  tibble::as_tibble(dados)
}
# Error-tolerant wrapper: a page whose extraction fails contributes a single
# marker row (erro = "erro") instead of aborting the whole run.
safe <- purrr::possibly(
  pegar_infos_pagina,
  otherwise = tibble::tibble(erro = "erro")
)

# Run the extractor over the selected pages (named by page number so the
# extractor receives the number as its second argument) and stack the
# per-page tibbles, recording the originating page in `pagina`.
da_pessoas <- purrr::list_rbind(
  purrr::imap(purrr::set_names(pdf_full[paginas], paginas), safe),
  names_to = "pagina"
)

# Sanity check: list the pages whose row count differs from 16.
# NOTE(review): presumably 16 is the expected number of records per page --
# confirm.
print(
  dplyr::filter(
    dplyr::count(da_pessoas, pagina, name = "aff"),
    aff != 16
  ),
  n = 100
)

# Export the combined table to Excel.
writexl::write_xlsx(da_pessoas, path = "contatos_pessoas_pdf.xlsx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment