Skip to content

Instantly share code, notes, and snippets.

@justinledwards
Created April 25, 2024 20:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justinledwards/68e819c1e6051e6c9bfd77a5cd289ca6 to your computer and use it in GitHub Desktop.
Save justinledwards/68e819c1e6051e6c9bfd77a5cd289ca6 to your computer and use it in GitHub Desktop.
Auto-choosing chunked AI summary bot
import os
import subprocess
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from playwright.sync_api import sync_playwright
# Size budget shared by the whole pipeline. It is the default `max_size` of
# summarize() (which asks the model for at most max_size / 4 words), half of it
# is the per-chunk token cap in get_chunks(), and combine_summaries() compares
# it against the character length of the joined summaries.
MAX_CONTEXT_SIZE = 8000
# Number of trailing whitespace-separated words of the previous chunk that
# load_html_from_string() prepends to the chunk being summarized.
OVERLAP_TOKENS = 100
def fetch_html(url):
    """Load ``url`` in a Playwright-launched Chromium and return its HTML and title.

    Args:
        url: Address of the page to open.

    Returns:
        A ``(html, title)`` tuple holding the values of ``page.content()``
        and ``page.title()`` after navigation.

    Raises:
        Whatever Playwright raises when launching, navigating or reading the
        page fails; the browser is still closed in that case.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            html = page.content()
            title = page.title()
        finally:
            # Close the browser even when navigation or extraction fails,
            # so a bad URL does not leave a browser process behind.
            browser.close()
    return html, title
def summarize(text, title, max_size=MAX_CONTEXT_SIZE):
    """Ask the local ``llama3`` model (via ``ollama run``) to summarize ``text``.

    Args:
        text: Content to summarize; sent to the subprocess on stdin as UTF-8.
        title: Page title interpolated into the instruction prompt.
        max_size: Size budget; the prompt requests at most ``max_size / 4``
            words (truncated to an integer).

    Returns:
        The subprocess's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        subprocess.CalledProcessError: If ``ollama`` exits with a non-zero
            status (``check=True``).
    """
    word_budget = int(max_size / 4)
    instruction = f"summarize the actual content of {title}, max {word_budget} words"
    command = ['ollama', 'run', 'llama3', instruction]
    completed = subprocess.run(
        command,
        input=text.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    raw_output = completed.stdout.decode()
    return raw_output.strip()
def get_chunks(html):
    """Split the visible text of ``html`` into space-joined token chunks.

    The text is extracted with BeautifulSoup, split into sentences and then
    into word tokens with NLTK. Sentences are packed greedily into chunks of
    at most ``MAX_CONTEXT_SIZE // 2`` tokens.

    Args:
        html: HTML document as a string.

    Returns:
        A list of non-empty strings, each the tokens of one chunk joined by
        single spaces. The list is empty when the document has no text.
    """
    limit = MAX_CONTEXT_SIZE // 2
    soup = BeautifulSoup(html, 'lxml')
    text = soup.get_text()

    chunks = []
    chunk = []
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        if len(chunk) + len(tokens) <= limit:
            chunk.extend(tokens)
            continue

        # Flush the chunk in progress, but never emit an empty one (this
        # happens when the very first sentence is already over the limit).
        if chunk:
            chunks.append(' '.join(chunk))

        # A single sentence longer than the limit (e.g. unpunctuated page
        # text) is cut into limit-sized pieces so no chunk exceeds the cap.
        while len(tokens) > limit:
            chunks.append(' '.join(tokens[:limit]))
            tokens = tokens[limit:]
        chunk = tokens

    # Emit the trailing chunk only if it holds tokens, so empty input yields
    # no chunks instead of a single empty string.
    if chunk:
        chunks.append(' '.join(chunk))
    return chunks
def load_html_from_string(html, title):
    """Summarize ``html`` chunk by chunk, keeping the preferred summary of each.

    For every chunk from ``get_chunks``, two candidate summaries are requested
    (one with the default size budget, one with half of it) and
    ``evaluate_summaries`` picks between them. Both candidates and the pick
    are printed to stdout.

    Args:
        html: HTML document as a string.
        title: Page title forwarded to the summarizing and evaluating prompts.

    Returns:
        A list with the chosen summary for each chunk, in chunk order.
    """
    chosen = []
    previous = None
    for number, current in enumerate(get_chunks(html), start=1):
        # Carry the tail of the previous chunk forward as context; the first
        # chunk has nothing to carry.
        if previous is None:
            carry = ''
        else:
            carry = ' '.join(previous.split()[-OVERLAP_TOKENS:])
        model_input = f"{carry} {current}"

        summary1 = summarize(model_input, title)
        summary2 = summarize(model_input, title, max_size=MAX_CONTEXT_SIZE // 2)
        winner = evaluate_summaries(summary1, summary2, title)

        report_lines = (
            f"Choices for chunk {number}:",
            "Choice 1",
            "--------",
            f"{summary1}",
            "--------",
            "Choice 2",
            "--------",
            f"{summary2}",
            f"Chosen summary: {winner}\n",
        )
        for line in report_lines:
            print(line)

        chosen.append(winner)
        previous = current
    return chosen
def evaluate_summaries(summary1, summary2, title):
    """Ask the local ``llama3`` model to relay the better of two summaries.

    Both candidates are embedded in the prompt, which is passed to
    ``ollama run`` as a command-line argument.

    Args:
        summary1: First candidate summary.
        summary2: Second candidate summary.
        title: Page title interpolated into the prompt.

    Returns:
        The subprocess's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        subprocess.CalledProcessError: If ``ollama`` exits with a non-zero
            status (``check=True``).
    """
    instruction = (
        "You are a bot that relays the best summary of two choices. "
        f"Only output the exact content of {title} of the chosen best "
        "with no extra text as you are only a part of the larger output: "
        f"\r\nChoice 1\r\n {summary1} \r\n---\r\nChoice 2\r\n {summary2}"
    )
    command = ['ollama', 'run', 'llama3', instruction]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    raw_output = completed.stdout.decode()
    return raw_output.strip()
def combine_summaries(summaries):
    """Merge per-chunk summaries into one markdown summary via ``llama3``.

    The summaries are joined with spaces. If the joined text is longer than
    ``MAX_CONTEXT_SIZE`` characters it is first condensed with ``summarize``
    (empty title, half the size budget). The result is sent on stdin to
    ``ollama run`` with a clean-up-and-combine instruction.

    Args:
        summaries: Iterable of summary strings.

    Returns:
        The subprocess's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        subprocess.CalledProcessError: If ``ollama`` exits with a non-zero
            status (``check=True``).
    """
    merged = ' '.join(summaries)
    too_long = len(merged) > MAX_CONTEXT_SIZE
    if too_long:
        merged = summarize(merged, '', max_size=MAX_CONTEXT_SIZE // 2)

    instruction = (
        'These summaries are the response of a bot choosing the best of 2 sub summaries. '
        'Clean up any parts they relayed other than just the content and combine these '
        'sub-summaries into a comprehensive summary of actual content in markdown format:'
    )
    completed = subprocess.run(
        ['ollama', 'run', 'llama3', instruction],
        input=merged.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    raw_output = completed.stdout.decode()
    return raw_output.strip()
if __name__ == "__main__":
    import sys

    # The script takes exactly one positional argument: the page URL.
    if len(sys.argv) == 2:
        url = sys.argv[1]
    else:
        print("Usage: script.py <URL>")
        sys.exit(1)

    # Fetch the page, summarize it chunk by chunk, then merge the results.
    html, title = fetch_html(url)
    summaries = load_html_from_string(html, title)
    comprehensive_summary = combine_summaries(summaries)
    print(comprehensive_summary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment