@masta-g3
Created December 1, 2023 02:02
%load_ext autoreload
%autoreload 2
# Summarizer
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders import ArxivLoader
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback
import tiktoken
import os
import re
import json
import arxiv
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from dotenv import load_dotenv
warnings.filterwarnings("ignore")
load_dotenv()
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
def summarize_by_segments(paper_title: str, document: str):
    """Summarize a paper by segments."""
    doc_chunks = text_splitter.create_documents([document])
    ## First section.
    current_chunk = doc_chunks[0].page_content
    summary_notes = numbered_to_bullet_list(
        chain.run({"paper_title": paper_title,
                   "previous_notes": "*(No notes, just starting to read.)*",
                   "content": current_chunk}) + "\n")
    ## All other sections.
    for current_chunk in tqdm(doc_chunks[1:]):
        summary_notes += numbered_to_bullet_list(
            chain.run({"paper_title": paper_title,
                       "previous_notes": summary_notes,
                       "content": current_chunk.page_content}) + "\n")
    return summary_notes

def numbered_to_bullet_list(list_str: str):
    """Convert a numbered list to a bullet list."""
    list_str = re.sub(r'^\d+\.', r'-', list_str, flags=re.MULTILINE)
    return list_str
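## Quick sanity check of the list conversion (illustrative example, not part of
## the original gist): the regex rewrites leading "1.", "2.", ... markers into "-".
example_notes = "1. First note.\n2. Second note."
assert numbered_to_bullet_list(example_notes) == "- First note.\n- Second note."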
## LLM Chain Setup
## Underlying LLM.
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo-16k", temperature=0.1)
## Create prompt.
system_message = """You are an applied AI researcher specialized in the field of Large Language Models (LLMs), and you are currently reviewing the academic paper "{paper_title}". Your goal is to analyze the paper, identify the main contributions and most interesting findings, and write a summary of it in your own words. This summary will serve as reference for future LLM researchers within your organization, so it is very important that you are able to convey the main ideas in a clear, complete and concise manner.
You have already read through some of the initial sections and taken some notes:
{previous_notes}
Now you must read over the following section and continue expanding on your notes (without repeating information).
{content}
## Guidelines
- Make sure to identify connections between the paper segments and the notes you have already taken. Avoid duplicate comments and ensure that your summary is coherent.
- Focus on the bigger picture and the main ideas, rather than on the details.
- If a table is presented just report back the main findings.
- Be sure to explain any new concept or term you introduce. Explain how things work, and be precise when discussing metrics and results.
- If examples are provided you can include them in your notes, as long as they help clarify the main ideas.
- Take your notes in the form of a numbered list. Do not include headers or any other elements.
- Do not include more than 5 items in your list.
- Do not repeat information that is already present in your previous notes.
- Your summary must be shorter than the original paper.
"""
prompt = ChatPromptTemplate.from_messages([("system", system_message)])
chain = LLMChain(llm=llm, prompt=prompt)
## Summarize Paper
paper_names = [
    "Chain-of-verification reduces hallucination in large language models",
    "Discovering language model behaviors with model-written evaluations",
    "Measuring and narrowing the compositionality gap in language models.",
    "Text Rendering Strategies for Pixel Language Models",
    "Cappy: Outperforming and Boosting Large Multi-Task LMs with a Small Scorer",
    "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    "System 2 Attention (is something you might need too)",
    "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
]
print(len(paper_names))
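## NOTE: `preprocess` is used below but not defined anywhere in this gist. The
## helper here is a minimal sketch (an assumption, not the author's original code):
## it just collapses whitespace so the title can be passed as an ArxivLoader query.
def preprocess(title: str) -> str:
    """Normalize a paper title for use as an arXiv search query (assumed behavior)."""
    return re.sub(r"\s+", " ", title).strip()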
docs = ArxivLoader(query=preprocess(paper_names[0]), load_max_docs=1).load()
## Extract data.
paper_content = docs[0].page_content
paper_title = docs[0].metadata["Title"]
with get_openai_callback() as cb:
    ## Iteratively re-summarize the paper until the summary drops to 500 tokens
    ## or fewer, or stops shrinking by more than 200 tokens per pass.
    token_count = 999999999
    token_diff = 999999999
    i = 1
    ori_token_count = len(encoding.encode(docs[0].page_content))
    print(f"Starting tokens: {ori_token_count}")
    while token_count > 500 and token_diff > 200:
        print("------------------------")
        print(f"Summarization iteration {i}...")
        paper_content = summarize_by_segments(paper_title, paper_content)
        token_diff = token_count - len(encoding.encode(paper_content))
        token_count = len(encoding.encode(paper_content))
        frac = len(encoding.encode(paper_content)) / len(encoding.encode(docs[0].page_content))
        i += 1
        print(f"Total tokens: {token_count}")
        print(f"Compression: {frac:.2f}")
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(paper_content)
## Narrative Form
narrative_system_msg = """You are an expert New York Times technology writer tasked with writing a summary of "{paper_title}". Your task is to read the following set of notes and convert it into an engaging paragraph. You must not alter the meaning of the notes, but you can reorganize and rephrase in order to improve the flow of the paragraph. You should also abstain from making unwarranted inferences and avoid bombastic language.
{previous_notes}
"""
narrative_prompt = ChatPromptTemplate.from_messages([("system", narrative_system_msg)])
narrative_chain = LLMChain(llm=llm, prompt=narrative_prompt)
with get_openai_callback() as cb:
    narrative = narrative_chain.run({"paper_title": paper_title,
                                     "previous_notes": paper_content})
    print(narrative)
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(narrative)
print(len(encoding.encode(narrative)))
## Copywriting
copywriting_system_msg = """You are a copywriter tasked with reviewing the following summary of "{paper_title}" and improving it. Your goal is to make the summary more engaging and readable, remove duplicate content, and preserve the meaning of the original text. You may reorganize and rephrase the text as you see fit, but do not alter its meaning. Your output should be a single paragraph.
{previous_summary}
"""
copywriting_prompt = ChatPromptTemplate.from_messages([("system", copywriting_system_msg)])
copywriting_chain = LLMChain(llm=llm, prompt=copywriting_prompt)
with get_openai_callback() as cb:
    copywriting = copywriting_chain.run({"paper_title": paper_title,
                                         "previous_summary": narrative})
    print(copywriting)
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(copywriting)
print(len(encoding.encode(copywriting)))