@harperreed
Created September 16, 2023 03:16
Chain of Density Summarization
# Import required packages
from dotenv import load_dotenv # For managing environment variables
from html2text import html2text # For HTML to markdown conversion
from readability import Document # For extracting the main article content from HTML
import json # For JSON parsing
import logging # For logging errors
import openai # OpenAI GPT API
import os # For OS-level operations
import requests # For HTTP requests
import tiktoken # For token counting
#
# ▄████▄ ██░ ██ ▄▄▄ ██▓ ███▄ █ ▒█████ █████▒
# ▒██▀ ▀█ ▓██░ ██▒▒████▄ ▓██▒ ██ ▀█ █ ▒██▒ ██▒▓██ ▒
# ▒▓█ ▄ ▒██▀▀██░▒██ ▀█▄ ▒██▒▓██ ▀█ ██▒ ▒██░ ██▒▒████ ░
# ▒▓▓▄ ▄██▒░▓█ ░██ ░██▄▄▄▄██ ░██░▓██▒ ▐▌██▒ ▒██ ██░░▓█▒ ░
# ▒ ▓███▀ ░░▓█▒░██▓ ▓█ ▓██▒░██░▒██░ ▓██░ ░ ████▓▒░░▒█░
# ░ ░▒ ▒ ░ ▒ ░░▒░▒ ▒▒ ▓▒█░░▓ ░ ▒░ ▒ ▒ ░ ▒░▒░▒░ ▒ ░
# ░ ▒ ▒ ░▒░ ░ ▒ ▒▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ▒ ▒░ ░
# ░ ░ ░░ ░ ░ ▒ ▒ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░
# ░
# ▓█████▄ ▓█████ ███▄ █ ██████ ██▓▄▄▄█████▓▓██ ██▓
# ▒██▀ ██▌▓█ ▀ ██ ▀█ █ ▒██ ▒ ▓██▒▓ ██▒ ▓▒ ▒██ ██▒
# ░██ █▌▒███ ▓██ ▀█ ██▒░ ▓██▄ ▒██▒▒ ▓██░ ▒░ ▒██ ██░
# ░▓█▄ ▌▒▓█ ▄ ▓██▒ ▐▌██▒ ▒ ██▒░██░░ ▓██▓ ░ ░ ▐██▓░
# ░▒████▓ ░▒████▒▒██░ ▓██░▒██████▒▒░██░ ▒██▒ ░ ░ ██▒▓░
# ▒▒▓ ▒ ░░ ▒░ ░░ ▒░ ▒ ▒ ▒ ▒▓▒ ▒ ░░▓ ▒ ░░ ██▒▒▒
# ░ ▒ ▒ ░ ░ ░░ ░░ ░ ▒░░ ░▒ ░ ░ ▒ ░ ░ ▓██ ░▒░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░ ▒ ▒ ░░
# ░ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░
# ██▓███ ██▀███ ▒█████ ███▄ ▄███▓ ██▓███ ▄▄▄█████▓ ██▓ ███▄ █ ▄████
# ▓██░ ██▒▓██ ▒ ██▒▒██▒ ██▒▓██▒▀█▀ ██▒▓██░ ██▒▓ ██▒ ▓▒▓██▒ ██ ▀█ █ ██▒ ▀█▒
# ▓██░ ██▓▒▓██ ░▄█ ▒▒██░ ██▒▓██ ▓██░▓██░ ██▓▒▒ ▓██░ ▒░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░
# ▒██▄█▓▒ ▒▒██▀▀█▄ ▒██ ██░▒██ ▒██ ▒██▄█▓▒ ▒░ ▓██▓ ░ ░██░▓██▒ ▐▌██▒░▓█ ██▓
# ▒██▒ ░ ░░██▓ ▒██▒░ ████▓▒░▒██▒ ░██▒▒██▒ ░ ░ ▒██▒ ░ ░██░▒██░ ▓██░░▒▓███▀▒
# ▒▓▒░ ░ ░░ ▒▓ ░▒▓░░ ▒░▒░▒░ ░ ▒░ ░ ░▒▓▒░ ░ ░ ▒ ░░ ░▓ ░ ▒░ ▒ ▒ ░▒ ▒
# ░▒ ░ ░▒ ░ ▒░ ░ ▒ ▒░ ░ ░ ░░▒ ░ ░ ▒ ░░ ░░ ░ ▒░ ░ ░
# ░░ ░░ ░ ░ ░ ░ ▒ ░ ░ ░░ ░ ▒ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░ ░ ░ ░ ░
#
#
# This snippet summarizes a web page with the GPT-4 model using Chain of Density prompting.
# You can read more here:
# https://arxiv.org/pdf/2309.04269.pdf
#
# The script takes a URL as input, fetches the HTML content, and extracts the main
# article text from the HTML. The text is converted to markdown and inserted into the
# Chain of Density prompt for the GPT-4 model. The model produces five increasingly
# dense summaries, and the final one is printed to the console.
#
# The magic is the prompt template, which is the same as the one used in the paper.
#
# Load environment variables from the .env file
load_dotenv()
# Set OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Cap on content tokens sent to the OpenAI API (leaves headroom in GPT-4's 8K context window)
openai_token_limit = 4000
print("""
______ ____ _____ ______ ___ ________ ___ ______________ _ __
/ __/ / / / |/ / |/ / _ | / _ \/ _/_ / / _ /_ __/ _/ __ \/ |/ /
_\ \/ /_/ / /|_/ / /|_/ / __ |/ , _// / / /_/ __ |/ / _/ // /_/ / /
/___/\____/_/ /_/_/ /_/_/ |_/_/|_/___/ /___/_/ |_/_/ /___/\____/_/|_/
""")
# Accept URL from user
summary_url = input("Enter a URL: ")
# Fetch webpage content
response = requests.get(summary_url)
# Extract the main article content from the HTML using readability
doc = Document(response.content)
# Convert the extracted article HTML to markdown
markdown_content = html2text(doc.summary())
# Define prompt template. This is the magical prompt from the paper
prompt = """
Article: {}
You will generate increasingly concise, entity-dense summaries of the above article.
Repeat the following 2 steps 5 times.
Step 1. Identify 1-3 informative entities (";" delimited) from the article which are missing from the previously generated summary.
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the missing entities.
A missing entity is:
- relevant to the main story,
- specific yet concise (5 words or fewer),
- novel (not in the previous summary),
- faithful (present in the article),
- anywhere (can be located anywhere in the article).
Guidelines:
- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words.
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities.
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without the article.
- Missing entities can appear anywhere in the new summary.
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
Remember, use the exact same number of words for each summary.
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary".
"""
# Initialize tokenizer and count tokens in markdown content
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(markdown_content))
# Print token count
print(f"\nNum Tokens (content and prompt): {num_tokens}")
# Check if token count exceeds OpenAI limit
if num_tokens > openai_token_limit:
    # Truncate text to fit within token limit
    markdown_content = encoding.decode(encoding.encode(markdown_content)[:openai_token_limit])
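# Note (hedged sketch, not part of the original gist): the cap above budgets only the
# article content; the prompt template and the model's JSON reply also consume context.
# A budget-aware alternative could reserve room for both, e.g. assuming ~1500 tokens of
# headroom for the reply:
def truncate_to_budget(text, template, limit=openai_token_limit, reply_headroom=1500):
    """Trim text so that content + prompt template + reply fit within `limit` tokens."""
    template_tokens = len(encoding.encode(template))
    budget = max(limit - template_tokens - reply_headroom, 0)
    return encoding.decode(encoding.encode(text)[:budget])
# (Defined here only as an illustration; the script does not call it, so behaviour is unchanged.)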
# Format the prompt with the markdown content
summaryPrompt = prompt.format(markdown_content)
print("Generating summary...\n")
# Initialize the OpenAI API call
try:
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": summaryPrompt}
        ]
    )
except Exception:  # Log the exception and stop if the API call fails
    logging.exception("API call failed")
    raise
# Extract the summary from the API response
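# (openai.ChatCompletion is the pre-1.0 OpenAI SDK interface; its response objects
# accept both attribute and dict-style access, which is why the mixed indexing works.)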
output = response.choices[0]["message"]['content']
# Attempt to parse the summary as JSON and extract 'Denser_Summary'
try:
    output = json.loads(output)
    output = output[-1]['Denser_Summary']
except json.JSONDecodeError:  # Log exception if JSON parsing fails; fall back to printing the raw text
    logging.exception("JSON decoding failed")
# Print the final summary
print(f"\nSummary:\n{output}\n\n")