Skip to content

Instantly share code, notes, and snippets.

@zilto
Last active August 11, 2023 21:16
Show Gist options
  • Save zilto/1464b703904378abc6e7dd78af212aeb to your computer and use it in GitHub Desktop.
# summarization.py
# ... imports
def summarize_chunk_of_text_prompt(content_type: str = "an academic paper") -> str:
    """Build the base prompt used to summarize a single chunk of text.

    :param content_type: description of the source material, interpolated into the prompt.
    :return: the prompt string, ready for the chunk text to be appended after it.
    """
    prompt = f"Summarize this text from {content_type}. Extract any key points with reasoning.\n\nContent:"
    return prompt
def summarize_text_from_summaries_prompt(content_type: str = "an academic paper") -> str:
    """Build the prompt that collates chunk summaries into one final summary.

    The returned template still contains the literal placeholders ``{query}`` and
    ``{results}`` for later ``str.format``-style substitution.

    :param content_type: description of the source material, interpolated into the prompt.
    :return: the prompt template string.
    """
    template = f"""Write a summary collated from this collection of key points extracted from {content_type}.
The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
User query: {{query}}
The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
Key points:\n{{results}}\nSummary:\n"""
    return template
@config.when(file_type="pdf")
def raw_text__pdf(pdf_source: str | bytes | tempfile.SpooledTemporaryFile) -> str:
    """Extract the full text of a PDF, annotating each page with its page number.

    :param pdf_source: path, bytes, or file-like object containing one PDF.
    :return: a single string: the text of every page concatenated, each page's
        text followed by a line reading ``Page Number: <n>`` (1-indexed).
    """
    reader = PdfReader(pdf_source)
    # enumerate(start=1) replaces the manual page counter; str.join avoids
    # quadratic string concatenation on large documents.
    return "".join(
        page.extract_text() + f"\nPage Number: {page_number}"
        for page_number, page in enumerate(reader.pages, start=1)
    )
# ...
def chunked_text(
    raw_text: str, max_token_length: int = 1500, tokenizer_encoding: str = "cl100k_base"
) -> list[str]:
    """Chunk text into pieces of at most ``max_token_length`` tokens.

    :param raw_text: the text to chunk.
    :param max_token_length: the maximum number of tokens allowed in each chunk.
    :param tokenizer_encoding: the tiktoken encoding name used to tokenize the text.
    :return: list of text chunks, each decoded back to a string.
    """
    tokenizer = tiktoken.get_encoding(tokenizer_encoding)
    # _create_chunks yields token-id sequences; decode each back to text.
    encoded_chunks = _create_chunks(raw_text, max_token_length, tokenizer)
    return [tokenizer.decode(chunk) for chunk in encoded_chunks]
# ...
def summarized_text(
    prompt_and_text_content: str,
    openai_gpt_model: str,
) -> str:
    """Send the assembled prompt to the OpenAI chat-completion API and return the reply.

    :param prompt_and_text_content: the prompt and content to send over.
    :param openai_gpt_model: which openai gpt model to use.
    :return: the string response from the openai API.
    """
    messages = [{"role": "user", "content": prompt_and_text_content}]
    # temperature=0 for deterministic summarization output.
    response = openai.ChatCompletion.create(
        model=openai_gpt_model,
        messages=messages,
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]
if __name__ == "__main__":
    # Run as a script to test Hamilton's execution: build a driver over this
    # module and render its dataflow graph to summary.png.
    import summarization
    from hamilton import base, driver

    adapter = base.SimplePythonGraphAdapter(base.DictResult())
    dr = driver.Driver({}, summarization, adapter=adapter)
    dr.display_all_functions("summary", {"format": "png"})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment