NewGraphEnvironment/gist:a9da0243c48ad352b2bb2ff44184cc1d

## gistfile1.txt
# Load required libraries
{
  library(pdftools)
  library(tm)
  library(stringr)
  library(chattr)
  library(tidyverse)
}

# Sanity check: send a short question through chattr() before doing any real work
chattr('which model are you and what are you up to? Please explain in less than 100 words', stream = FALSE)

# Define the path to your PDF
# NOTE(review): absolute path on the author's machine - change before running elsewhere
pdf_path <- "/Users/airvine/zotero/storage/B67TPQIS/guenther_2023_playbook_to_guide_landscape_recovery_strategies_&_priorities_for_salmon_habitat.pdf"


# Convert PDF to text
# NOTE: the result is stored under the same name as the pdf_text() function it calls
pdf_text <- pdf_text(pdf_path)


# Concatenate all pages into a single string
# (elements of pdf_text are joined with a single space between them)
pdf_string_raw <- paste(pdf_text, collapse = " ")

# NOT IMPLEMENTED in the main workflow, but could remove the table of contents:
# keeps only the text that follows the last run of at least 10 consecutive
# periods in `s`. When `s` contains no such run it is returned unchanged.
# Only the first element of `s` is searched for runs of periods.
cut_string <- function(s) {
  dot_runs <- str_locate_all(s, "\\.{10,}")[[1]]

  # No run of periods found: nothing to cut
  if (length(dot_runs) == 0) {
    return(s)
  }

  # Column 2 of the locate matrix holds the end position of each match;
  # the last row is the last match in the string
  last_end <- dot_runs[[nrow(dot_runs), 2]]
  str_sub(s, start = last_end + 1)
}

# clean the string: flatten line breaks, delete runs of blanks and strip the
# header/footer strings that repeat through the extracted text
pdf_string <- pdf_string_raw %>%
  str_replace_all("\n", " ") %>%
  # runs of 3-99 spaces are deleted outright (not collapsed to one space), so
  # the text on either side of a run ends up joined together
  str_replace_all(" {3,99}", "") %>%
  # fixed() so the trailing "." is matched as a literal period rather than as
  # the regex wildcard (which would also swallow any other character after INC)
  str_replace_all(fixed("EDI ENVIRONMENTAL DYNAMICS INC."), " ") %>%
  str_remove_all("DRAFT\\d+") %>%
  str_remove_all("Playbook to Guide Landscape Recovery Strategies & Priorities for SalmonHabitat Following Major Wildfires") %>%
  # Remove "DRAFT" followed by a space and a 1-3 digit number without a leading zero (1-999)
  str_replace_all("DRAFT [1-9][0-9]{0,2}", "")

# Variant of the cleaned text with everything up to the last run of periods cut off
pdf_string2 <- cut_string(pdf_string)

# Define the maximum chunk size.
# NOTE(review): the units are presumably model tokens (a later comment mentions
# a 10000 token maximum) - confirm against the API limits in use
token_max_size <- 10000 # total budget shared by prompt and response
token_response_size <- 7000 # share of the budget set aside for the response
token_prompt_size <- token_max_size - token_response_size # what is left for the prompt (3000)
token_chars_per_word <- 5 # conversion factor used below: characters per word
max_chunk_size <- token_prompt_size * token_chars_per_word # chunk limit in characters (15000)
max_words_chunk <- max_chunk_size / token_chars_per_word # the same limit converted back to words (3000)


# Break `text` into pieces of at most `chunk_size` characters.
#
# The text is first cut at candidate section breaks: positions preceded by
# whitespace and followed by one or more digits and at least 3 capital letters.
# Any section longer than `chunk_size` is then sliced into consecutive pieces
# of `chunk_size` characters (the last piece may be shorter).
#
# text:       string to split
# chunk_size: maximum number of characters per chunk
# Returns a character vector of chunks in document order.
split_into_chunks <- function(text, chunk_size) {
  sections <- str_split(text, "(?<=\\s)(?=\\d+[A-Z]{3,})", simplify = TRUE)

  # One slot per section; a slot holds either the section itself or its slices
  pieces <- vector("list", length(sections))

  for (k in seq_along(sections)) {
    section <- sections[k]
    section_len <- str_length(section)

    if (section_len > chunk_size) {
      # Slice number j covers characters (j - 1) * chunk_size + 1 to j * chunk_size
      slice_ids <- seq_len(ceiling(section_len / chunk_size))
      pieces[[k]] <- str_sub(
        section,
        (slice_ids - 1) * chunk_size + 1,
        slice_ids * chunk_size
      )
    } else {
      pieces[[k]] <- section
    }
  }

  # Flatten the per-section results into one character vector
  unlist(pieces)
}


# Size estimate for a piece of text: the number of word-character runs (\w+)
# plus the number of punctuation characters ([[:punct:]]).
# Vectorised over `sentence`; returns one count per element.
count_words_punct <- function(sentence) {
  str_count(sentence, "\\w+") + str_count(sentence, "[[:punct:]]")
}


# Split the cleaned text into chunks of at most max_chunk_size characters.
# NOTE: the discard() step drops NA chunks only - empty strings ("") are kept.
# pdf_chunks[nzchar(pdf_chunks)] would be the way to drop empty strings.
pdf_chunks <- split_into_chunks(pdf_string, max_chunk_size) %>%
  discard(~all(is.na(.))) # Remove NA chunks


# calculate the size (words plus punctuation characters) of each chunk using purrr
token_counts <- map_dbl(pdf_chunks, count_words_punct)


# Send every element of `input` to chattr(), one call per element.
#
# input:       vector of texts; each is appended to `prompt_prep` to form the prompt
# wait_time:   seconds to pause between consecutive calls, so we don't exceed
#              the API calls/minute or the max number of tokens (10000)
# prompt_prep: text placed in front of every element of `input`
# stream:      passed to chattr() as its `stream` argument
# ...:         further arguments passed to chattr()
# Returns the collected chattr() results, one entry per element of `input`
# (NULL when `input` is empty).
loop_chat <- function(input = NULL,
                      wait_time = 15,
                      prompt_prep = 'please summarize the main points of this: ',
                      stream = FALSE,
                      ...) {
  n_inputs <- length(input)
  responses <- NULL

  for (idx in seq_len(n_inputs)) {
    # Pause between calls only: never before the first call or after the last
    if (idx > 1) {
      Sys.sleep(wait_time)
    }

    full_prompt <- paste0(prompt_prep, input[idx])
    responses[idx] <- chattr(full_prompt, stream = stream, ...)
  }

  responses
}

# Show the three largest per-chunk counts
sort(token_counts, decreasing = TRUE)[1:3]
# Get the indices of the sorted vector
sorted_indices <- order(token_counts, decreasing = TRUE)

# Get the indices of the top 3 elements
chunks_largest <- sorted_indices[1:3]

# find where the TOC ends so we can omit
# (prints chunk 16 so it can be inspected by eye)
pdf_chunks[16]

# Index of the last table-of-contents chunk.
# NOTE(review): hard-coded, presumably from inspecting the chunk printed above -
# re-check whenever the PDF, the cleaning steps or the chunk size change
toc_ends <- 16

# source the summary3.R file made by three iterations of the entire report
# NOTE(review): presumably this file defines `summary3`, used in the prompt below - confirm
source("~/Projects/repo/wildfire_summary3.R")

# build the prompt
# paste0() joins the pieces with no separator: the fixed instructions, then
# `summary3` as context, then the ":::" marker. loop_chat() appends the chunk
# text directly after this prompt.
# NOTE(review): the instructions mention 25 chunks - keep in step with the
# number of chunks actually sent
prompt_built <- paste0(
  'please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double)',
  'in your response and include only level 1 headers only using markdown syntax in your response.  ',
  'do not make a level 1 header called Summary - we want the actual level 1 headers from the etext you are reading.  Level 1 is single digit ',
  'followed by CAPITAL title ex. 1 INTRODUCTION and 2 WILDFIRE REOVERY PLANNING AND DECISION MAKING. We dont want level 2 headers please ex 6.2 DECISION-MAKING SUPPORRT TOOLS.',
  'Chunks dont necessarily start with a level 1 header. Thats ok. IMPORTANT - After this first colon in this prompt a quick summary of the whole document.  You are not summarizeing the text after ',
  'the colon until after a sequence of three colons and a space so please wait till you read the whole thing!! Before three colons and a space is only provided for ',
  'context!  You are getting the document in 25 chunks passed in a loop so remember that you will only need pieces of this summary of ',
  'context that will be followed by three colons and a space in each response.  All the text summarizd in this prompt will not ',
  'be present in the text you are to summarize and that is ok. Summarize text after the three colons ONLY.  Here is more context: ',
  summary3,
  'Now ITS GO TIME - SUMMARIZE THIS::: ')


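# if we kept it really simple we could have called the loop_chat function directly to get an object
# (note the parentheses: toc_ends + 1:length(pdf_chunks) would index past the end and yield NAs)
# t <- loop_chat(input = pdf_chunks[(toc_ends + 1):length(pdf_chunks)],
#                prompt_prep = prompt_built)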


# had a failure - need to find the chunk that failed
# string_to_find <- "Silvicultural/Hydrological" #second last chunk
# matches <- str_detect(pdf_chunks, fixed(string_to_find))
# which(matches)
# #[1] 51 55 64

# Send chunks to the model one at a time and burn the responses to an rmarkdown
# file that can be rendered or fed back as a prompt.
#
# The file gets a hidden R chunk that assigns the quoted responses to
# `prompt_engineered`, followed by inline R code that prints that vector.
#
# chunks_send:    vector of text chunks; each one is sent in its own loop_chat() call
# output_file:    path of the file to write (overwritten if it exists)
# prompt_to_send: prompt placed in front of every chunk (defaults to the global prompt_built)
# wait_return:    seconds to pause between chunks to stay under API rate limits
# ...:            forwarded to loop_chat() (e.g. wait_time, stream)
# Returns a list with one NULL per chunk; the function is called for its side effect.
write_to_file <- function(chunks_send = NULL,
                          output_file = "output.Rmd",
                          prompt_to_send = prompt_built,
                          wait_return = 15,
                          ...) {
  # Start (or overwrite) the file with the opening of the hidden R chunk
  writeLines("```{r include=FALSE} \n prompt_engineered <- c(", output_file)

  # Closes the c( call and the code fence, then prints the vector inline
  closing <- ")\n```\n`r prompt_engineered`"

  n_chunks <- length(chunks_send)
  if (n_chunks == 0) {
    # Nothing to send: still close the chunk so the file is well-formed
    cat(closing, file = output_file, append = TRUE)
    return(list())
  }

  # Send chunk number i and append its quoted response to the file.
  # Plain functions are used instead of a purrr formula lambda because a
  # formula lambda has its own `...` (holding the mapped element), which would
  # hide the `...` of write_to_file() and stop it reaching loop_chat().
  send_chunk <- function(i) {
    # Anything printed to standard output during the request is diverted to the file
    con <- file(output_file, open = "a")
    sink(con, append = TRUE)
    # Restore output and release the connection even when the request fails
    on.exit({
      sink()
      close(con)
    }, add = TRUE)

    result <- loop_chat(input = chunks_send[[i]],
                        prompt_prep = prompt_to_send,
                        ...)

    # If result is not a character string, convert it
    if (!is.character(result)) {
      result <- as.character(result)
    }

    # Escape backslashes first, then let shQuote() add the surrounding double
    # quotes and escape embedded ones, so the entry parses as an R string
    quoted <- shQuote(gsub("\\", "\\\\", result, fixed = TRUE), type = "cmd")

    # Position, not content, decides which entry is last: a comma after every
    # entry except the final one, which closes the chunk instead
    suffix <- if (i < n_chunks) "," else closing
    cat(paste0(quoted, suffix), file = output_file, append = TRUE)
  }

  purrr::map(
    seq_len(n_chunks),
    function(i) {
      send_chunk(i)

      # Pause between chunks to not go over API rate limits (not after the last one)
      if (i < n_chunks) {
        Sys.sleep(wait_return)
      }
      NULL
    },
    .progress = list(
      type = "iterator",
      format = "Writing to file {cli::pb_bar} {cli::pb_percent}",
      clear = TRUE
    )
  )
}

# test sending it all at once to gpt-4-0125-preview - the response was:
# I'm sorry, but I cannot provide a summary for the document as it exceeds my current capabilities to process and
# summarize such extensive and complex content. However, I can help answer questions or provide information on specific topics related to watershed recovery, salmon habitat restoration, or any other related subject. Please let me know how I can assist you further."
# write_to_file(chunks_send = pdf_string2,
#               output_file = "test.Rmd",
#               prompt_to_send = "please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double): ",
#               wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
# )

# Call the function in a test
# Sends only the two chunks that follow toc_ends, with a short prompt, and writes to test.Rmd
write_to_file(chunks_send = pdf_chunks[c(toc_ends + 1,toc_ends + 2)],
              output_file = "test.Rmd",
              prompt_to_send = "summarize in 100 words or less: ",
              wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
)

# call the whole thing
# Uses the default prompt (prompt_built).
# NOTE(review): every chunk is sent here, including those at or before toc_ends -
# confirm the table of contents is meant to be included
write_to_file(chunks_send = pdf_chunks,
              output_file = "output_gpt-4-0125-preview.Rmd",
              wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
              )
	# Load required libraries
	{
	library(pdftools)
	library(tm)
	library(stringr)
	library(chattr)
	library(tidyverse)
	}

	# Sanity check: send a short question through chattr() before doing any real work
	chattr('which model are you and what are you up to? Please explain in less than 100 words', stream = FALSE)

	# Define the path to your PDF
	# NOTE(review): absolute path on the author's machine - change before running elsewhere
	pdf_path <- "/Users/airvine/zotero/storage/B67TPQIS/guenther_2023_playbook_to_guide_landscape_recovery_strategies_&_priorities_for_salmon_habitat.pdf"


	# Convert PDF to text
	# NOTE: the result is stored under the same name as the pdf_text() function it calls
	pdf_text <- pdf_text(pdf_path)


	# Concatenate all pages into a single string
	# (elements of pdf_text are joined with a single space between them)
	pdf_string_raw <- paste(pdf_text, collapse = " ")

	# NOT IMPLEMENTED in the main workflow, but could remove the table of contents:
	# keeps only the text that follows the last run of at least 10 consecutive
	# periods in `s`. When `s` contains no such run it is returned unchanged.
	# Only the first element of `s` is searched for runs of periods.
	cut_string <- function(s) {
	  dot_runs <- str_locate_all(s, "\\.{10,}")[[1]]

	  # No run of periods found: nothing to cut
	  if (length(dot_runs) == 0) {
	    return(s)
	  }

	  # Column 2 of the locate matrix holds the end position of each match;
	  # the last row is the last match in the string
	  last_end <- dot_runs[[nrow(dot_runs), 2]]
	  str_sub(s, start = last_end + 1)
	}

	# clean the string: flatten line breaks, delete runs of blanks and strip the
	# header/footer strings that repeat through the extracted text
	pdf_string <- pdf_string_raw %>%
	  str_replace_all("\n", " ") %>%
	  # runs of 3-99 spaces are deleted outright (not collapsed to one space), so
	  # the text on either side of a run ends up joined together
	  str_replace_all(" {3,99}", "") %>%
	  # fixed() so the trailing "." is matched as a literal period rather than as
	  # the regex wildcard (which would also swallow any other character after INC)
	  str_replace_all(fixed("EDI ENVIRONMENTAL DYNAMICS INC."), " ") %>%
	  str_remove_all("DRAFT\\d+") %>%
	  str_remove_all("Playbook to Guide Landscape Recovery Strategies & Priorities for SalmonHabitat Following Major Wildfires") %>%
	  # Remove "DRAFT" followed by a space and a 1-3 digit number without a leading zero (1-999)
	  str_replace_all("DRAFT [1-9][0-9]{0,2}", "")

	# Variant of the cleaned text with everything up to the last run of periods cut off
	pdf_string2 <- cut_string(pdf_string)

	# Define the maximum chunk size.
	# NOTE(review): the units are presumably model tokens (a later comment mentions
	# a 10000 token maximum) - confirm against the API limits in use
	token_max_size <- 10000 # total budget shared by prompt and response
	token_response_size <- 7000 # share of the budget set aside for the response
	token_prompt_size <- token_max_size - token_response_size # what is left for the prompt (3000)
	token_chars_per_word <- 5 # conversion factor used below: characters per word
	max_chunk_size <- token_prompt_size * token_chars_per_word # chunk limit in characters (15000)
	max_words_chunk <- max_chunk_size / token_chars_per_word # the same limit converted back to words (3000)


	# Break `text` into pieces of at most `chunk_size` characters.
	#
	# The text is first cut at candidate section breaks: positions preceded by
	# whitespace and followed by one or more digits and at least 3 capital letters.
	# Any section longer than `chunk_size` is then sliced into consecutive pieces
	# of `chunk_size` characters (the last piece may be shorter).
	#
	# text:       string to split
	# chunk_size: maximum number of characters per chunk
	# Returns a character vector of chunks in document order.
	split_into_chunks <- function(text, chunk_size) {
	  sections <- str_split(text, "(?<=\\s)(?=\\d+[A-Z]{3,})", simplify = TRUE)

	  # One slot per section; a slot holds either the section itself or its slices
	  pieces <- vector("list", length(sections))

	  for (k in seq_along(sections)) {
	    section <- sections[k]
	    section_len <- str_length(section)

	    if (section_len > chunk_size) {
	      # Slice number j covers characters (j - 1) * chunk_size + 1 to
	      # j * chunk_size (the multiplication signs were missing here, which
	      # made the function unparseable)
	      slice_ids <- seq_len(ceiling(section_len / chunk_size))
	      pieces[[k]] <- str_sub(
	        section,
	        (slice_ids - 1) * chunk_size + 1,
	        slice_ids * chunk_size
	      )
	    } else {
	      pieces[[k]] <- section
	    }
	  }

	  # Flatten the per-section results into one character vector
	  unlist(pieces)
	}


	# Size estimate for a piece of text: the number of word-character runs (\w+)
	# plus the number of punctuation characters ([[:punct:]]).
	# Vectorised over `sentence`; returns one count per element.
	count_words_punct <- function(sentence) {
	  str_count(sentence, "\\w+") + str_count(sentence, "[[:punct:]]")
	}


	# Split the cleaned text into chunks of at most max_chunk_size characters.
	# NOTE: the discard() step drops NA chunks only - empty strings ("") are kept.
	# pdf_chunks[nzchar(pdf_chunks)] would be the way to drop empty strings.
	pdf_chunks <- split_into_chunks(pdf_string, max_chunk_size) %>%
	discard(~all(is.na(.))) # Remove NA chunks


	# calculate the size (words plus punctuation characters) of each chunk using purrr
	token_counts <- map_dbl(pdf_chunks, count_words_punct)


	# Send every element of `input` to chattr(), one call per element.
	#
	# input:       vector of texts; each is appended to `prompt_prep` to form the prompt
	# wait_time:   seconds to pause between consecutive calls, so we don't exceed
	#              the API calls/minute or the max number of tokens (10000)
	# prompt_prep: text placed in front of every element of `input`
	# stream:      passed to chattr() as its `stream` argument
	# ...:         further arguments passed to chattr()
	# Returns the collected chattr() results, one entry per element of `input`
	# (NULL when `input` is empty).
	loop_chat <- function(input = NULL,
	                      wait_time = 15,
	                      prompt_prep = 'please summarize the main points of this: ',
	                      stream = FALSE,
	                      ...) {
	  n_inputs <- length(input)
	  responses <- NULL

	  for (idx in seq_len(n_inputs)) {
	    # Pause between calls only: never before the first call or after the last
	    if (idx > 1) {
	      Sys.sleep(wait_time)
	    }

	    full_prompt <- paste0(prompt_prep, input[idx])
	    responses[idx] <- chattr(full_prompt, stream = stream, ...)
	  }

	  responses
	}

	# Show the three largest per-chunk counts
	sort(token_counts, decreasing = TRUE)[1:3]
	# Get the indices of the sorted vector
	sorted_indices <- order(token_counts, decreasing = TRUE)

	# Get the indices of the top 3 elements
	chunks_largest <- sorted_indices[1:3]

	# find where the TOC ends so we can omit
	# (prints chunk 16 so it can be inspected by eye)
	pdf_chunks[16]

	# Index of the last table-of-contents chunk.
	# NOTE(review): hard-coded, presumably from inspecting the chunk printed above -
	# re-check whenever the PDF, the cleaning steps or the chunk size change
	toc_ends <- 16

	# source the summary3.R file made by three iterations of the entire report
	# NOTE(review): presumably this file defines `summary3`, used in the prompt below - confirm
	source("~/Projects/repo/wildfire_summary3.R")

	# build the prompt
	# paste0() joins the pieces with no separator: the fixed instructions, then
	# `summary3` as context, then the ":::" marker. loop_chat() appends the chunk
	# text directly after this prompt.
	# NOTE(review): the instructions mention 25 chunks - keep in step with the
	# number of chunks actually sent
	prompt_built <- paste0(
	'please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double)',
	'in your response and include only level 1 headers only using markdown syntax in your response. ',
	'do not make a level 1 header called Summary - we want the actual level 1 headers from the etext you are reading. Level 1 is single digit ',
	'followed by CAPITAL title ex. 1 INTRODUCTION and 2 WILDFIRE REOVERY PLANNING AND DECISION MAKING. We dont want level 2 headers please ex 6.2 DECISION-MAKING SUPPORRT TOOLS.',
	'Chunks dont necessarily start with a level 1 header. Thats ok. IMPORTANT - After this first colon in this prompt a quick summary of the whole document. You are not summarizeing the text after ',
	'the colon until after a sequence of three colons and a space so please wait till you read the whole thing!! Before three colons and a space is only provided for ',
	'context! You are getting the document in 25 chunks passed in a loop so remember that you will only need pieces of this summary of ',
	'context that will be followed by three colons and a space in each response. All the text summarizd in this prompt will not ',
	'be present in the text you are to summarize and that is ok. Summarize text after the three colons ONLY. Here is more context: ',
	summary3,
	'Now ITS GO TIME - SUMMARIZE THIS::: ')


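	# if we kept it really simple we could have called the loop_chat function directly to get an object
	# (note the parentheses: toc_ends + 1:length(pdf_chunks) would index past the end and yield NAs)
	# t <- loop_chat(input = pdf_chunks[(toc_ends + 1):length(pdf_chunks)],
	# prompt_prep = prompt_built)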


	# had a failure - need to find the chunk that failed
	# string_to_find <- "Silvicultural/Hydrological" #second last chunk
	# matches <- str_detect(pdf_chunks, fixed(string_to_find))
	# which(matches)
	# #[1] 51 55 64

	# Send chunks to the model one at a time and burn the responses to an rmarkdown
	# file that can be rendered or fed back as a prompt.
	#
	# The file gets a hidden R chunk that assigns the quoted responses to
	# `prompt_engineered`, followed by inline R code that prints that vector.
	#
	# chunks_send:    vector of text chunks; each one is sent in its own loop_chat() call
	# output_file:    path of the file to write (overwritten if it exists)
	# prompt_to_send: prompt placed in front of every chunk (defaults to the global prompt_built)
	# wait_return:    seconds to pause between chunks to stay under API rate limits
	# ...:            forwarded to loop_chat() (e.g. wait_time, stream)
	# Returns a list with one NULL per chunk; the function is called for its side effect.
	write_to_file <- function(chunks_send = NULL,
	                          output_file = "output.Rmd",
	                          prompt_to_send = prompt_built,
	                          wait_return = 15,
	                          ...) {
	  # Start (or overwrite) the file with the opening of the hidden R chunk
	  writeLines("```{r include=FALSE} \n prompt_engineered <- c(", output_file)

	  # Closes the c( call and the code fence, then prints the vector inline
	  closing <- ")\n```\n`r prompt_engineered`"

	  n_chunks <- length(chunks_send)
	  if (n_chunks == 0) {
	    # Nothing to send: still close the chunk so the file is well-formed
	    cat(closing, file = output_file, append = TRUE)
	    return(list())
	  }

	  # Send chunk number i and append its quoted response to the file.
	  # Plain functions are used instead of a purrr formula lambda because a
	  # formula lambda has its own `...` (holding the mapped element), which would
	  # hide the `...` of write_to_file() and stop it reaching loop_chat().
	  send_chunk <- function(i) {
	    # Anything printed to standard output during the request is diverted to the file
	    con <- file(output_file, open = "a")
	    sink(con, append = TRUE)
	    # Restore output and release the connection even when the request fails
	    on.exit({
	      sink()
	      close(con)
	    }, add = TRUE)

	    result <- loop_chat(input = chunks_send[[i]],
	                        prompt_prep = prompt_to_send,
	                        ...)

	    # If result is not a character string, convert it
	    if (!is.character(result)) {
	      result <- as.character(result)
	    }

	    # Escape backslashes first, then let shQuote() add the surrounding double
	    # quotes and escape embedded ones, so the entry parses as an R string
	    quoted <- shQuote(gsub("\\", "\\\\", result, fixed = TRUE), type = "cmd")

	    # Position, not content, decides which entry is last: a comma after every
	    # entry except the final one, which closes the chunk instead
	    suffix <- if (i < n_chunks) "," else closing
	    cat(paste0(quoted, suffix), file = output_file, append = TRUE)
	  }

	  purrr::map(
	    seq_len(n_chunks),
	    function(i) {
	      send_chunk(i)

	      # Pause between chunks to not go over API rate limits (not after the last one)
	      if (i < n_chunks) {
	        Sys.sleep(wait_return)
	      }
	      NULL
	    },
	    .progress = list(
	      type = "iterator",
	      format = "Writing to file {cli::pb_bar} {cli::pb_percent}",
	      clear = TRUE
	    )
	  )
	}

	# test sending it all at once to gpt-4-0125-preview - the response was:
	# I'm sorry, but I cannot provide a summary for the document as it exceeds my current capabilities to process and
	# summarize such extensive and complex content. However, I can help answer questions or provide information on specific topics related to watershed recovery, salmon habitat restoration, or any other related subject. Please let me know how I can assist you further."
	# write_to_file(chunks_send = pdf_string2,
	# output_file = "test.Rmd",
	# prompt_to_send = "please summarize the main themes of this, remove redundancies, do not include any quote characters (single or double): ",
	# wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
	# )

	# Call the function in a test
	# Sends only the two chunks that follow toc_ends, with a short prompt, and writes to test.Rmd
	write_to_file(chunks_send = pdf_chunks[c(toc_ends + 1,toc_ends + 2)],
	output_file = "test.Rmd",
	prompt_to_send = "summarize in 100 words or less: ",
	wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
	)

	# call the whole thing
	# Uses the default prompt (prompt_built).
	# NOTE(review): every chunk is sent here, including those at or before toc_ends -
	# confirm the table of contents is meant to be included
	write_to_file(chunks_send = pdf_chunks,
	output_file = "output_gpt-4-0125-preview.Rmd",
	wait_time = 0 #this passes to loop_chat and can be set to zero because we needed wait_return for some reason
	)