@masta-g3
Created December 1, 2023 02:02
%load_ext autoreload
%autoreload 2
# Summarizer
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders import ArxivLoader
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback
import tiktoken
import os
import re
import json
import arxiv
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from dotenv import load_dotenv
warnings.filterwarnings("ignore")
load_dotenv()
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
def summarize_by_segments(paper_title: str, document: str):
    """Summarize a paper by segments."""
    doc_chunks = text_splitter.create_documents([document])
    ## First section.
    current_chunk = doc_chunks[0].page_content
    summary_notes = numbered_to_bullet_list(
        chain.run({"paper_title": paper_title,
                   "previous_notes": "*(No notes, just starting to read.)*",
                   "content": current_chunk}) + "\n")
    ## All other sections.
    for current_chunk in tqdm(doc_chunks[1:]):
        summary_notes += numbered_to_bullet_list(
            chain.run({"paper_title": paper_title,
                       "previous_notes": summary_notes,
                       "content": current_chunk.page_content}) + "\n")
    return summary_notes

def numbered_to_bullet_list(list_str: str):
    """Convert a numbered list to a bullet list."""
    list_str = re.sub(r'^\d+\.', r'-', list_str, flags=re.MULTILINE)
    return list_str
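## Quick sanity check of the list conversion (illustrative example, not part of
## the original gist): the regex rewrites leading "1.", "2.", ... markers into "-".
example_notes = "1. First note.\n2. Second note."
assert numbered_to_bullet_list(example_notes) == "- First note.\n- Second note."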
## LLM Chain Setup
## Underlying LLM.
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo-16k", temperature=0.1)
## Create prompt.
system_message = """You are an applied AI researcher specialized in the field of Large Language Models (LLMs), and you are currently reviewing the academic paper "{paper_title}". Your goal is to analyze the paper, identify the main contributions and most interesting findings, and write a summary of it in your own words. This summary will serve as reference for future LLM researchers within your organization, so it is very important that you are able to convey the main ideas in a clear, complete and concise manner.
You have already read through some of the initial sections and taken some notes:
{previous_notes}
Now you must read over the following section and continue expanding on your notes (without repeating information).
{content}
## Guidelines
- Make sure to identify connections between the paper segments and the notes you have already taken. Avoid duplicate comments and ensure that your summary is coherent.
- Focus on the bigger picture and the main ideas, rather than on the details.
- If a table is presented just report back the main findings.
- Be sure to explain any new concept or term you introduce. Explain how things work, and be precise when discussing metrics and results.
- If examples are provided you can include them in your notes, as long as they help clarify the main ideas.
- Take your notes in the form of a numbered list. Do not include headers or any other elements.
- Do not include more than 5 items in your list.
- Do not repeat information that is already present in your previous notes.
- Your summary must be shorter than the original paper.
"""
prompt = ChatPromptTemplate.from_messages([("system", system_message)])
chain = LLMChain(llm=llm, prompt=prompt)
## Summarize Paper
paper_names = [
    "Chain-of-verification reduces hallucination in large language models",
    "Discovering language model behaviors with model-written evaluations",
    "Measuring and narrowing the compositionality gap in language models.",
    "Text Rendering Strategies for Pixel Language Models",
    "Cappy: Outperforming and Boosting Large Multi-Task LMs with a Small Scorer",
    "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    "System 2 Attention (is something you might need too)",
    "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
]
print(len(paper_names))
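## NOTE: `preprocess` is used below but not defined anywhere in this gist. The
## helper here is a minimal sketch (an assumption, not the author's original code):
## it just collapses whitespace so the title can be passed as an ArxivLoader query.
def preprocess(title: str) -> str:
    """Normalize a paper title for use as an arXiv search query (assumed behavior)."""
    return re.sub(r"\s+", " ", title).strip()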
docs = ArxivLoader(query=preprocess(paper_names[0]), load_max_docs=1).load()
## Extract data.
paper_content = docs[0].page_content
paper_title = docs[0].metadata["Title"]
with get_openai_callback() as cb:
    ## Iteratively re-summarize the paper until the summary drops to 500 tokens
    ## or fewer, or stops shrinking by more than 200 tokens per pass.
    token_count = 999999999
    token_diff = 999999999
    i = 1
    ori_token_count = len(encoding.encode(docs[0].page_content))
    print(f"Starting tokens: {ori_token_count}")
    while token_count > 500 and token_diff > 200:
        print("------------------------")
        print(f"Summarization iteration {i}...")
        paper_content = summarize_by_segments(paper_title, paper_content)
        token_diff = token_count - len(encoding.encode(paper_content))
        token_count = len(encoding.encode(paper_content))
        frac = len(encoding.encode(paper_content)) / len(encoding.encode(docs[0].page_content))
        i += 1
        print(f"Total tokens: {token_count}")
        print(f"Compression: {frac:.2f}")
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(paper_content)
## Narrative Form
narrative_system_msg = """You are an expert New York Times technology writer tasked with writing a summary of "{paper_title}". Your task is to read the following set of notes and convert it into an engaging paragraph. You must not alter the meaning of the notes, but you can reorganize and rephrase in order to improve the flow of the paragraph. You should also abstain from making unwarranted inferences and avoid bombastic language.
{previous_notes}
"""
narrative_prompt = ChatPromptTemplate.from_messages([("system", narrative_system_msg)])
narrative_chain = LLMChain(llm=llm, prompt=narrative_prompt)
with get_openai_callback() as cb:
    narrative = narrative_chain.run({"paper_title": paper_title,
                                     "previous_notes": paper_content})
    print(narrative)
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(narrative)
print(len(encoding.encode(narrative)))
## Copywriting
copywriting_system_msg = """You are a copywriter tasked with reviewing the following summary of "{paper_title}" and improving it. Your goal is to make the summary more engaging and readable, remove duplicate content, and preserve the meaning of the original text. You may reorganize and rephrase the text as you see fit, but do not alter its meaning. Your output should be a single paragraph.
{previous_summary}
"""
copywriting_prompt = ChatPromptTemplate.from_messages([("system", copywriting_system_msg)])
copywriting_chain = LLMChain(llm=llm, prompt=copywriting_prompt)
with get_openai_callback() as cb:
    copywriting = copywriting_chain.run({"paper_title": paper_title,
                                         "previous_summary": narrative})
    print(copywriting)
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(copywriting)
print(len(encoding.encode(copywriting)))