# summarization.py

# the original gist elides its imports; these are the ones the code below needs
import tempfile

import openai
import tiktoken
from pypdf import PdfReader  # PyPDF2>=3.0 exposes the same PdfReader class

from hamilton.function_modifiers import config


def summarize_chunk_of_text_prompt(content_type: str = "an academic paper") -> str:
    """Base prompt for summarizing chunks of text."""
    return f"Summarize this text from {content_type}. Extract any key points with reasoning.\n\nContent:"


def summarize_text_from_summaries_prompt(content_type: str = "an academic paper") -> str:
    """Prompt for summarizing a paper from a list of summaries."""
    return f"""Write a summary collated from this collection of key points extracted from {content_type}.
The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
User query: {{query}}
The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
Key points:\n{{results}}\nSummary:\n"""
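
# Illustrative usage (not part of the original gist): the doubled braces above are
# f-string escapes, so the returned template still contains literal {query} and
# {results} placeholders to be filled in downstream, e.g.:
#
#   template = summarize_text_from_summaries_prompt()
#   prompt = template.format(query="What is the core argument?", results="- point 1\n- point 2")
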
@config.when(file_type="pdf")
def raw_text__pdf(pdf_source: str | bytes | tempfile.SpooledTemporaryFile) -> str:
    """Takes a filepath to a PDF and returns a string of the PDF's contents.

    :param pdf_source: filepath, bytes, or file object pointing to a PDF.
    :return: a single string of the PDF's contents, with a page-number marker after each page.
    """
    reader = PdfReader(pdf_source)
    _pdf_text = ""
    for page_number, page in enumerate(reader.pages, start=1):
        _pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return _pdf_text
# ...
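
# The gist elides the `_create_chunks` helper that `chunked_text` below calls.
# This is a hypothetical minimal stand-in (fixed-size token windows); the original
# implementation may differ, e.g. by splitting on sentence boundaries. The leading
# underscore keeps it out of the Hamilton DAG.
def _create_chunks(text: str, max_length: int, tokenizer) -> list[list[int]]:
    """Hypothetical sketch: split the encoded text into windows of max_length tokens."""
    tokens = tokenizer.encode(text)
    return [tokens[i : i + max_length] for i in range(0, len(tokens), max_length)]
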
def chunked_text(
    raw_text: str, max_token_length: int = 1500, tokenizer_encoding: str = "cl100k_base"
) -> list[str]:
    """Chunks the raw text into chunks of at most max_token_length tokens.

    :param raw_text: the raw text to chunk.
    :param max_token_length: the maximum number of tokens in each chunk.
    :param tokenizer_encoding: the tiktoken encoding to use for the tokenizer.
    :return: a list of text chunks.
    """
    tokenizer = tiktoken.get_encoding(tokenizer_encoding)
    _encoded_chunks = _create_chunks(raw_text, max_token_length, tokenizer)
    _decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
    return _decoded_chunks
# ...
def summarized_text(
    prompt_and_text_content: str,
    openai_gpt_model: str,
) -> str:
    """Summarizes the text from the summarized chunks of the pdf.

    :param prompt_and_text_content: the prompt and content to send over.
    :param openai_gpt_model: which openai gpt model to use.
    :return: the string response from the openai API.
    """
    response = openai.ChatCompletion.create(
        model=openai_gpt_model,
        messages=[
            {
                "role": "user",
                "content": prompt_and_text_content,
            }
        ],
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]
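
# Note: `openai.ChatCompletion.create` above targets the pre-1.0 openai SDK. A hedged
# equivalent for openai>=1.0 (an assumption, not part of the original gist); the
# underscore prefix keeps it out of the Hamilton DAG:
def _summarized_text_openai_v1(prompt_and_text_content: str, openai_gpt_model: str) -> str:
    """Hypothetical sketch of the same call against the openai>=1.0 client API."""
    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=openai_gpt_model,
        messages=[{"role": "user", "content": prompt_and_text_content}],
        temperature=0,
    )
    return response.choices[0].message.content
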
if __name__ == "__main__":
    # run as a script to test Hamilton's execution
    import summarization

    from hamilton import base, driver

    dr = driver.Driver(
        {},
        summarization,
        adapter=base.SimplePythonGraphAdapter(base.DictResult()),
    )
    dr.display_all_functions("summary", {"format": "png"})
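    # Hypothetical usage (an assumption, not in the original gist): executing the DAG
    # end to end would also need the elided upstream nodes and an OpenAI API key, and
    # a config selecting the PDF branch of @config.when, e.g.:
    #
    #   dr = driver.Driver({"file_type": "pdf"}, summarization,
    #                      adapter=base.SimplePythonGraphAdapter(base.DictResult()))
    #   result = dr.execute(["summarized_text"], inputs={"pdf_source": "paper.pdf", ...})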