@harperreed
Created September 16, 2023 03:16
Chain of Density Summarization
# Import required packages
from dotenv import load_dotenv # For managing environment variables
from html2text import html2text # For HTML to markdown conversion
from readability import Document # For extracting the main article content from HTML
import json # For JSON parsing
import logging # For logging errors
import openai # OpenAI GPT API
import os # For OS-level operations
import requests # For HTTP requests
import tiktoken # For token counting
#
# ▄████▄ ██░ ██ ▄▄▄ ██▓ ███▄ █ ▒█████ █████▒
# ▒██▀ ▀█ ▓██░ ██▒▒████▄ ▓██▒ ██ ▀█ █ ▒██▒ ██▒▓██ ▒
# ▒▓█ ▄ ▒██▀▀██░▒██ ▀█▄ ▒██▒▓██ ▀█ ██▒ ▒██░ ██▒▒████ ░
# ▒▓▓▄ ▄██▒░▓█ ░██ ░██▄▄▄▄██ ░██░▓██▒ ▐▌██▒ ▒██ ██░░▓█▒ ░
# ▒ ▓███▀ ░░▓█▒░██▓ ▓█ ▓██▒░██░▒██░ ▓██░ ░ ████▓▒░░▒█░
# ░ ░▒ ▒ ░ ▒ ░░▒░▒ ▒▒ ▓▒█░░▓ ░ ▒░ ▒ ▒ ░ ▒░▒░▒░ ▒ ░
# ░ ▒ ▒ ░▒░ ░ ▒ ▒▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ▒ ▒░ ░
# ░ ░ ░░ ░ ░ ▒ ▒ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░
# ░
# ▓█████▄ ▓█████ ███▄ █ ██████ ██▓▄▄▄█████▓▓██ ██▓
# ▒██▀ ██▌▓█ ▀ ██ ▀█ █ ▒██ ▒ ▓██▒▓ ██▒ ▓▒ ▒██ ██▒
# ░██ █▌▒███ ▓██ ▀█ ██▒░ ▓██▄ ▒██▒▒ ▓██░ ▒░ ▒██ ██░
# ░▓█▄ ▌▒▓█ ▄ ▓██▒ ▐▌██▒ ▒ ██▒░██░░ ▓██▓ ░ ░ ▐██▓░
# ░▒████▓ ░▒████▒▒██░ ▓██░▒██████▒▒░██░ ▒██▒ ░ ░ ██▒▓░
# ▒▒▓ ▒ ░░ ▒░ ░░ ▒░ ▒ ▒ ▒ ▒▓▒ ▒ ░░▓ ▒ ░░ ██▒▒▒
# ░ ▒ ▒ ░ ░ ░░ ░░ ░ ▒░░ ░▒ ░ ░ ▒ ░ ░ ▓██ ░▒░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░ ▒ ▒ ░░
# ░ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░
# ██▓███ ██▀███ ▒█████ ███▄ ▄███▓ ██▓███ ▄▄▄█████▓ ██▓ ███▄ █ ▄████
# ▓██░ ██▒▓██ ▒ ██▒▒██▒ ██▒▓██▒▀█▀ ██▒▓██░ ██▒▓ ██▒ ▓▒▓██▒ ██ ▀█ █ ██▒ ▀█▒
# ▓██░ ██▓▒▓██ ░▄█ ▒▒██░ ██▒▓██ ▓██░▓██░ ██▓▒▒ ▓██░ ▒░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░
# ▒██▄█▓▒ ▒▒██▀▀█▄ ▒██ ██░▒██ ▒██ ▒██▄█▓▒ ▒░ ▓██▓ ░ ░██░▓██▒ ▐▌██▒░▓█ ██▓
# ▒██▒ ░ ░░██▓ ▒██▒░ ████▓▒░▒██▒ ░██▒▒██▒ ░ ░ ▒██▒ ░ ░██░▒██░ ▓██░░▒▓███▀▒
# ▒▓▒░ ░ ░░ ▒▓ ░▒▓░░ ▒░▒░▒░ ░ ▒░ ░ ░▒▓▒░ ░ ░ ▒ ░░ ░▓ ░ ▒░ ▒ ▒ ░▒ ▒
# ░▒ ░ ░▒ ░ ▒░ ░ ▒ ▒░ ░ ░ ░░▒ ░ ░ ▒ ░░ ░░ ░ ▒░ ░ ░
# ░░ ░░ ░ ░ ░ ░ ▒ ░ ░ ░░ ░ ▒ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░ ░ ░ ░ ░
#
#
# This snippet summarizes a web page with the GPT-4 model using Chain of Density prompting.
# You can read more here:
# https://arxiv.org/pdf/2309.04269.pdf
#
# The script takes a URL as input, fetches the HTML content, and extracts the main
# article text from the HTML. The text is converted to markdown and inserted into the
# Chain of Density prompt for the GPT-4 model. The model produces five increasingly
# dense summaries, and the final one is printed to the console.
#
# The magic is the prompt template, which is the same as the one used in the paper.
#
# Load environment variables from the .env file
load_dotenv()
# Set OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Cap on content tokens sent to the OpenAI API (leaves headroom in GPT-4's 8K context window)
openai_token_limit = 4000
print("""
______ ____ _____ ______ ___ ________ ___ ______________ _ __
/ __/ / / / |/ / |/ / _ | / _ \/ _/_ / / _ /_ __/ _/ __ \/ |/ /
_\ \/ /_/ / /|_/ / /|_/ / __ |/ , _// / / /_/ __ |/ / _/ // /_/ / /
/___/\____/_/ /_/_/ /_/_/ |_/_/|_/___/ /___/_/ |_/_/ /___/\____/_/|_/
""")
# Accept URL from user
summary_url = input("Enter a URL: ")
# Fetch webpage content
response = requests.get(summary_url)
# Extract the main article content from the HTML using readability
doc = Document(response.content)
# Convert the extracted article HTML to markdown
markdown_content = html2text(doc.summary())
# Define prompt template. This is the magical prompt from the paper
prompt = """
Article: {}
You will generate increasingly concise, entity-dense summaries of the above article.
Repeat the following 2 steps 5 times.
Step 1. Identify 1-3 informative entities (";" delimited) from the article which are missing from the previously generated summary.
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the missing entities.
A missing entity is:
- relevant to the main story,
- specific yet concise (5 words or fewer),
- novel (not in the previous summary),
- faithful (present in the article),
- anywhere (can be located anywhere in the article).
Guidelines:
- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words.
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities.
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without the article.
- Missing entities can appear anywhere in the new summary.
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
Remember, use the exact same number of words for each summary.
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary".
"""
# Initialize tokenizer and count tokens in markdown content
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(markdown_content))
# Print token count
print(f"\nNum Tokens (content and prompt): {num_tokens}")
# Check if token count exceeds OpenAI limit
if num_tokens > openai_token_limit:
    # Truncate text to fit within token limit
    markdown_content = encoding.decode(encoding.encode(markdown_content)[:openai_token_limit])
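# Note (hedged sketch, not part of the original gist): the cap above budgets only the
# article content; the prompt template and the model's JSON reply also consume context.
# A budget-aware alternative could reserve room for both, e.g. assuming ~1500 tokens of
# headroom for the reply:
def truncate_to_budget(text, template, limit=openai_token_limit, reply_headroom=1500):
    """Trim text so that content + prompt template + reply fit within `limit` tokens."""
    template_tokens = len(encoding.encode(template))
    budget = max(limit - template_tokens - reply_headroom, 0)
    return encoding.decode(encoding.encode(text)[:budget])
# (Defined here only as an illustration; the script does not call it, so behaviour is unchanged.)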
# Format the prompt with the markdown content
summaryPrompt = prompt.format(markdown_content)
print("Generating summary...\n")
# Initialize the OpenAI API call
try:
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": summaryPrompt}
        ]
    )
except Exception:  # Log the exception and stop if the API call fails
    logging.exception("API call failed")
    raise
# Extract the summary from the API response
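# (openai.ChatCompletion is the pre-1.0 OpenAI SDK interface; its response objects
# accept both attribute and dict-style access, which is why the mixed indexing works.)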
output = response.choices[0]["message"]['content']
# Attempt to parse the summary as JSON and extract 'Denser_Summary'
try:
    output = json.loads(output)
    output = output[-1]['Denser_Summary']
except json.JSONDecodeError:  # Log exception if JSON parsing fails; fall back to printing the raw text
    logging.exception("JSON decoding failed")
# Print the final summary
print(f"\nSummary:\n{output}\n\n")