Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save NewGraphEnvironment/a9da0243c48ad352b2bb2ff44184cc1d to your computer and use it in GitHub Desktop.
Save NewGraphEnvironment/a9da0243c48ad352b2bb2ff44184cc1d to your computer and use it in GitHub Desktop.
chattr token breakout and file creation wildfire playbook review
# Load required libraries
{
library(pdftools)
library(tm)
library(stringr)
library(chattr)
library(tidyverse)
}
# Quick smoke test of the chat backend: ask it to identify itself.
# stream = FALSE is passed explicitly, as in every other chattr() call in this script.
chattr('which model are you and what are you up to? Please explain in less than 100 words', stream = FALSE)
# Define the path to your PDF
# NOTE(review): absolute, machine-specific path - edit before running elsewhere.
pdf_path <- "/Users/airvine/zotero/storage/B67TPQIS/guenther_2023_playbook_to_guide_landscape_recovery_strategies_&_priorities_for_salmon_habitat.pdf"
# Convert PDF to text
# NOTE(review): the result is stored under the same name as the function that
# produced it (pdf_text); presumably it holds one element per page - confirm.
pdf_text <- pdf_text(pdf_path)
# Concatenate all pages into a single string
# Elements are joined with a single space between them.
pdf_string_raw <- paste(pdf_text, collapse = " ")
# Drop a table of contents by keeping only the text that follows the last
# occurrence of a run of at least 10 periods (the dot leaders of a TOC line).
# If no such run exists the input is returned unchanged.
# NOT IMPLEMENTED in the chunking step: the chunks further down are built
# from pdf_string, not from the output of this function.
cut_string <- function(s) {
  dot_runs <- str_locate_all(s, "\\.{10,}")[[1]]
  # Nothing that looks like a dot leader: leave the text alone
  if (!(length(dot_runs) > 0)) {
    return(s)
  }
  # Last row of the match matrix; its second entry is the end position
  final_run <- tail(dot_runs, 1)
  str_sub(s, start = final_run[2] + 1)
}
# Clean the raw text. The steps run in order, and the report title is matched
# in the form it has after the space-collapsing step ("SalmonHabitat").
pdf_string <- pdf_string_raw %>%
  # line breaks become single spaces
  str_replace_all("\n", " ") %>%
  # runs of 3 to 99 spaces are removed outright (not replaced by a space)
  str_remove_all(" {3,99}") %>%
  # running header/footer text
  str_replace_all("EDI ENVIRONMENTAL DYNAMICS INC.", " ") %>%
  # "DRAFT" fused directly to a number
  str_replace_all("DRAFT\\d+", "") %>%
  str_replace_all("Playbook to Guide Landscape Recovery Strategies & Priorities for SalmonHabitat Following Major Wildfires", "") %>%
  # "DRAFT", a space, then a 1 to 3 digit number that does not start with 0
  str_remove_all("DRAFT [1-9][0-9]{0,2}")
# Same text with everything up to the last dot-leader run cut away
pdf_string2 <- pdf_string %>% cut_string()
# Define the maximum chunk size.
# Budget arithmetic: 10000 total minus 7000 reserved for the response leaves
# 3000 for the prompt.
# NOTE(review): presumably 10000 is the API token limit referred to in
# loop_chat's comments - confirm against the account's actual limit.
token_max_size <- 10000
token_response_size <- 7000
token_prompt_size <- token_max_size - token_response_size
# Conversion factor used to turn the prompt budget into a character count
# NOTE(review): assumes roughly 5 characters per word/token - confirm.
token_chars_per_word <- 5
# Character limit per chunk: 3000 * 5 = 15000
max_chunk_size <- token_prompt_size * token_chars_per_word
# Dividing by the same factor again gives back 3000
max_words_chunk <- max_chunk_size / token_chars_per_word
# Break `text` into chunks of at most `chunk_size` characters where possible.
#
# The text is first cut at report section breaks: positions that are preceded
# by whitespace and followed by digits running straight into three or more
# capital letters. A section that fits within `chunk_size` is kept whole; a
# longer one is sliced into consecutive pieces of `chunk_size` characters.
#
# Returns a character vector of chunks in document order.
split_into_chunks <- function(text, chunk_size) {
  sections <- as.vector(
    str_split(text, "(?<=\\s)(?=\\d+[A-Z]{3,})", simplify = TRUE)
  )

  # Return one section either untouched or as fixed-width slices
  slice_section <- function(section) {
    section_length <- str_length(section)
    if (section_length > chunk_size) {
      piece <- seq_len(ceiling(section_length / chunk_size))
      # str_sub is vectorised over start/end, so all slices come back at once
      str_sub(section, (piece - 1) * chunk_size + 1, piece * chunk_size)
    } else {
      section
    }
  }

  # Flatten the per-section results into a single character vector
  unlist(lapply(sections, slice_section))
}
# Estimate the size of each string as the number of word runs ("\\w+") plus
# the number of punctuation characters. Vectorised over `sentence`.
count_words_punct <- function(sentence) {
  str_count(sentence, "\\w+") + str_count(sentence, "[[:punct:]]")
}
# Chunk the cleaned text, then drop any chunk that is NA.
# Only NA values are filtered here; zero-length strings are kept
# (pdf_chunks[nzchar(pdf_chunks)] would be the way to drop those).
pdf_chunks <- discard(
  split_into_chunks(pdf_string, max_chunk_size),
  function(chunk) all(is.na(chunk))
)
# Size estimate (words + punctuation) for every chunk, via purrr
token_counts <- map_dbl(pdf_chunks, count_words_punct)
# Send each element of `input` to chattr(), one call per element, and collect
# the responses.
#
# Arguments:
#   input        Character vector (or list) of texts to send.
#   wait_time    Seconds to pause between consecutive calls so the API
#                calls/minute limit is not exceeded. No pause after the last.
#   prompt_prep  Text pasted in front of every element before sending.
#   stream       Passed to chattr() as `stream`.
#   ...          Further arguments passed on to chattr().
#
# Returns a character vector with one element per element of `input`
# (character(0) for empty input). Each response is coerced to character and
# collapsed with newlines, so an empty or multi-element return value from
# chattr() still yields exactly one string instead of an error or a silently
# truncated result.
loop_chat <- function(input = NULL,
                      wait_time = 15, #we don't want to time out the API calls/minute
                      prompt_prep = 'please summarize the main points of this: ',
                      stream = FALSE,
                      ...) {
  n_input <- length(input)
  # Preallocate instead of growing the vector inside the loop
  results <- character(n_input)
  for (i in seq_len(n_input)) {
    # Make the API call
    response <- chattr(paste0(prompt_prep, input[i]), stream = stream, ...)
    # Always store exactly one string per input element
    results[i] <- paste(as.character(response), collapse = "\n")
    # Wait before the next iteration to not exceed the API rate limit/max number of tokens = 10000
    if (i < n_input) {
      Sys.sleep(wait_time)
    }
  }
  results
}
# Print the three largest size estimates
sort(token_counts, decreasing = TRUE)[1:3]
# Get the indices of the sorted vector
sorted_indices <- order(token_counts, decreasing = TRUE)
# Get the indices of the top 3 elements
chunks_largest <- sorted_indices[1:3]
# find where the TOC ends so we can omit
# Chunk 16 is printed for inspection and its index is then recorded by hand.
# NOTE(review): 16 is hard-coded; re-check it whenever the PDF, the cleaning
# steps or max_chunk_size change, because chunk positions will shift.
pdf_chunks[16]
toc_ends <- 16
# source the summary3.R file made by three iterations of the entire report
# NOTE(review): presumably this script defines `summary3`, which is spliced
# into the prompt below - confirm.
source("~/Projects/repo/wildfire_summary3.R")
# build the prompt
# paste0() joins the pieces with no separator, so adjacent pieces run together
# exactly as written, and `summary3` is followed immediately by the final
# 'Now ITS GO TIME ...' instruction that ends in three colons and a space.
# NOTE(review): the literals are sent verbatim to the model, so their wording
# and spelling are left untouched here. The "25 chunks" figure is hard-coded -
# confirm it matches the number of chunks actually sent.
prompt_built <- paste0(
'please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double)',
'in your response and include only level 1 headers only using markdown syntax in your response. ',
'do not make a level 1 header called Summary - we want the actual level 1 headers from the etext you are reading. Level 1 is single digit ',
'followed by CAPITAL title ex. 1 INTRODUCTION and 2 WILDFIRE REOVERY PLANNING AND DECISION MAKING. We dont want level 2 headers please ex 6.2 DECISION-MAKING SUPPORRT TOOLS.',
'Chunks dont necessarily start with a level 1 header. Thats ok. IMPORTANT - After this first colon in this prompt a quick summary of the whole document. You are not summarizeing the text after ',
'the colon until after a sequence of three colons and a space so please wait till you read the whole thing!! Before three colons and a space is only provided for ',
'context! You are getting the document in 25 chunks passed in a loop so remember that you will only need pieces of this summary of ',
'context that will be followed by three colons and a space in each response. All the text summarizd in this prompt will not ',
'be present in the text you are to summarize and that is ok. Summarize text after the three colons ONLY. Here is more context: ',
summary3,
'Now ITS GO TIME - SUMMARIZE THIS::: ')
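# if we kept it really simple we could have called the loop_chat function to get an object
# (the parentheses matter: `:` binds tighter than `+`, so `toc_ends + 1:length(pdf_chunks)` would index past the end)
# t <- loop_chat(input = pdf_chunks[(toc_ends + 1):length(pdf_chunks)],
# prompt_prep = prompt_built)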
# had a failure - need to find the chunk that failed
# string_to_find <- "Silvicultural/Hydrological" #second last chunk
# matches <- str_detect(pdf_chunks, fixed(string_to_find))
# which(matches)
# #[1] 51 55 64
# make a function that burns the output to an rmarkdown file that can be rendered or fed back as a prompt
#
# The file holds one hidden R chunk that assigns the quoted responses to
# `prompt_engineered`, followed by an inline `r prompt_engineered` expression.
#
# Arguments:
#   chunks_send     Character vector (or list) of text chunks. loop_chat() is
#                   called once per element.
#   output_file     Path of the file to create; existing content is overwritten.
#   prompt_to_send  Prompt prefix handed to loop_chat() as `prompt_prep`.
#   wait_return     Seconds to pause between consecutive chunks (no pause
#                   after the last one).
#   ...             Forwarded to loop_chat() (e.g. wait_time, stream).
#
# Returns `output_file`, invisibly.
write_to_file <- function(chunks_send = NULL,
                          output_file = "output.Rmd",
                          prompt_to_send = prompt_built,
                          wait_return = 15,
                          ...) {
  # Capture the caller's extra arguments here. Inside a purrr formula lambda
  # `...` means the lambda's own arguments, so they would never reach
  # loop_chat() from there.
  extra_args <- list(...)
  closing <- ")\n```\n`r prompt_engineered`"
  n_chunks <- length(chunks_send)

  writeLines("```{r include=FALSE} \n prompt_engineered <- c(", output_file)

  # Nothing to send: close the vector and the code chunk so the file is valid
  if (n_chunks == 0) {
    cat(closing, file = output_file, append = TRUE)
    return(invisible(output_file))
  }

  # Query the model for chunk `i` and append its quoted response to the file
  write_chunk <- function(i) {
    con <- file(output_file, open = "a")
    sink(con, append = TRUE)
    # Restore console output and release the connection even if the call fails
    on.exit({
      sink()
      close(con)
    }, add = TRUE)

    result <- do.call(
      loop_chat,
      c(list(input = chunks_send[[i]], prompt_prep = prompt_to_send), extra_args)
    )
    # Exactly one quoted string per chunk, whatever shape the result has.
    # NOTE(review): shQuote(type = "cmd") escapes double quotes only; a
    # response containing a backslash may still not parse as an R string.
    quoted <- shQuote(paste(as.character(result), collapse = "\n"), type = "cmd")
    # A comma after every entry except the last; the last one closes the
    # vector and the code chunk. Position decides, not content, so duplicate
    # or NA chunks cannot close the vector early or be written unquoted.
    terminator <- if (i < n_chunks) "," else closing
    cat(paste0(quoted, terminator), file = output_file, append = TRUE)
  }

  purrr::walk(
    seq_len(n_chunks),
    function(i) {
      write_chunk(i)
      # Pause for a bit to not go over API rate limits
      if (i < n_chunks) {
        Sys.sleep(wait_return)
      }
    },
    .progress = list(
      type = "iterator",
      format = "Writing to file {cli::pb_bar} {cli::pb_percent}",
      clear = TRUE
    )
  )

  invisible(output_file)
}
# test sending it all at once to gpt-4-0125-preview
# I'm sorry, but I cannot provide a summary for the document as it exceeds my current capabilities to process and
# summarize such extensive and complex content. However, I can help answer questions or provide information on specific topics related to watershed recovery, salmon habitat restoration, or any other related subject. Please let me know how I can assist you further."
# write_to_file(chunks_send = pdf_string2,
# output_file = "test.Rmd",
# prompt_to_send = "please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double): ",
# wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
# )
# Call the function in a test
# Sends only the two chunks right after the table of contents, with a short
# prompt, and writes the result to test.Rmd.
write_to_file(chunks_send = pdf_chunks[c(toc_ends + 1,toc_ends + 2)],
output_file = "test.Rmd",
prompt_to_send = "summarize in 100 words or less: ",
wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
)
# call the whole thing
# prompt_to_send is not given, so the default (prompt_built) is used.
# NOTE(review): every element of pdf_chunks is sent, including the chunks up
# to toc_ends - confirm that sending the table of contents is intended.
write_to_file(chunks_send = pdf_chunks,
output_file = "output_gpt-4-0125-preview.Rmd",
wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment