# summarization.py

# the original gist elides its imports; these are the ones the code below needs
import tempfile

import openai
import tiktoken
from pypdf import PdfReader  # PyPDF2>=3.0 exposes the same PdfReader class

from hamilton.function_modifiers import config


def summarize_chunk_of_text_prompt(content_type: str = "an academic paper") -> str:
    """Base prompt for summarizing chunks of text."""
    return f"Summarize this text from {content_type}. Extract any key points with reasoning.\n\nContent:"


def summarize_text_from_summaries_prompt(content_type: str = "an academic paper") -> str:
    """Prompt for summarizing a paper from a list of summaries."""
    return f"""Write a summary collated from this collection of key points extracted from {content_type}.
The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
User query: {{query}}
The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
Key points:\n{{results}}\nSummary:\n"""
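
# Illustrative usage (not part of the original gist): the doubled braces above are
# f-string escapes, so the returned template still contains literal {query} and
# {results} placeholders to be filled in downstream, e.g.:
#
#   template = summarize_text_from_summaries_prompt()
#   prompt = template.format(query="What is the core argument?", results="- point 1\n- point 2")
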
@config.when(file_type="pdf")
def raw_text__pdf(pdf_source: str | bytes | tempfile.SpooledTemporaryFile) -> str:
    """Takes a filepath to a PDF and returns a string of the PDF's contents.

    :param pdf_source: filepath, bytes, or file object pointing to a PDF.
    :return: a single string of the PDF's contents, with a page-number marker after each page.
    """
    reader = PdfReader(pdf_source)
    _pdf_text = ""
    for page_number, page in enumerate(reader.pages, start=1):
        _pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return _pdf_text
# ...
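
# The gist elides the `_create_chunks` helper that `chunked_text` below calls.
# This is a hypothetical minimal stand-in (fixed-size token windows); the original
# implementation may differ, e.g. by splitting on sentence boundaries. The leading
# underscore keeps it out of the Hamilton DAG.
def _create_chunks(text: str, max_length: int, tokenizer) -> list[list[int]]:
    """Hypothetical sketch: split the encoded text into windows of max_length tokens."""
    tokens = tokenizer.encode(text)
    return [tokens[i : i + max_length] for i in range(0, len(tokens), max_length)]
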
def chunked_text(
    raw_text: str, max_token_length: int = 1500, tokenizer_encoding: str = "cl100k_base"
) -> list[str]:
    """Chunks the raw text into chunks of at most max_token_length tokens.

    :param raw_text: the raw text to chunk.
    :param max_token_length: the maximum number of tokens in each chunk.
    :param tokenizer_encoding: the tiktoken encoding to use for the tokenizer.
    :return: a list of text chunks.
    """
    tokenizer = tiktoken.get_encoding(tokenizer_encoding)
    _encoded_chunks = _create_chunks(raw_text, max_token_length, tokenizer)
    _decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
    return _decoded_chunks
# ...
def summarized_text(
    prompt_and_text_content: str,
    openai_gpt_model: str,
) -> str:
    """Summarizes the text from the summarized chunks of the pdf.

    :param prompt_and_text_content: the prompt and content to send over.
    :param openai_gpt_model: which openai gpt model to use.
    :return: the string response from the openai API.
    """
    response = openai.ChatCompletion.create(
        model=openai_gpt_model,
        messages=[
            {
                "role": "user",
                "content": prompt_and_text_content,
            }
        ],
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]
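
# Note: `openai.ChatCompletion.create` above targets the pre-1.0 openai SDK. A hedged
# equivalent for openai>=1.0 (an assumption, not part of the original gist); the
# underscore prefix keeps it out of the Hamilton DAG:
def _summarized_text_openai_v1(prompt_and_text_content: str, openai_gpt_model: str) -> str:
    """Hypothetical sketch of the same call against the openai>=1.0 client API."""
    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=openai_gpt_model,
        messages=[{"role": "user", "content": prompt_and_text_content}],
        temperature=0,
    )
    return response.choices[0].message.content
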
if __name__ == "__main__":
    # run as a script to test Hamilton's execution
    import summarization

    from hamilton import base, driver

    dr = driver.Driver(
        {},
        summarization,
        adapter=base.SimplePythonGraphAdapter(base.DictResult()),
    )
    dr.display_all_functions("summary", {"format": "png"})
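    # Hypothetical usage (an assumption, not in the original gist): executing the DAG
    # end to end would also need the elided upstream nodes and an OpenAI API key, and
    # a config selecting the PDF branch of @config.when, e.g.:
    #
    #   dr = driver.Driver({"file_type": "pdf"}, summarization,
    #                      adapter=base.SimplePythonGraphAdapter(base.DictResult()))
    #   result = dr.execute(["summarized_text"], inputs={"pdf_source": "paper.pdf", ...})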