Generate a headline, keywords, and a summary for a given article, plus some signals to help you judge whether it's worth reading.
# run like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html"
# you can optionally pass in a debug flag as the second argument, like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html" True
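#
# third-party dependencies (assumed from the imports, not pinned by this gist):
#   pip install requests termcolor readability-lxml beautifulsoup4 pyppeteer pyppeteer-stealth unidecode
# also assumes an Ollama server running locally on the default port with the llama3 model pulled:
#   ollama pull llama3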
import json
import requests
from termcolor import cprint
from collections import OrderedDict
from readability import Document
from bs4 import BeautifulSoup, NavigableString
from pyppeteer import launch
from pyppeteer_stealth import stealth
import asyncio
import time
import re
import sys
import logging
import textwrap
from pprint import pprint
from unidecode import unidecode
logging.getLogger().setLevel(logging.CRITICAL) #hiding errors that occur when pyppeteer crashes
############# get just the actual important text from a webpage #############
def get_page(url, debug=False):
    # get the page's raw html. using pyppeteer because it will render all the page content including javascript,
    # plus pyppeteer gets around lots of things that would otherwise block requests, like cloudflare, etc.
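    # note: pyppeteer may download a bundled Chromium build on first launch if one
    # isn't already installed, so the very first run can be slow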
    async def get_raw_page():  # starting up pyppeteer
        browser = await launch()
        page = await browser.newPage()
        await stealth(page)
        await page.goto(url, timeout=20000)  # set the timeout to 20 seconds, just in case something goes wrong
        raw_page = await page.content()
        await browser.close()
        return raw_page
    # try to get the page
    try:
        raw_page = asyncio.get_event_loop().run_until_complete(get_raw_page())  # get the page
    except Exception:
        cprint("A pyppeteer error occurred, attempting backup page pull.", 'red')
        raw_page = requests.get(url).text  # if something goes wrong with pyppeteer, just use requests as a fallback
        if len(raw_page) <= 100:
            cprint("Something went wrong, the page is too short to be valid.", 'magenta')
            return False
        else:
            cprint("Successfully pulled page content using backup method.\n", 'red')  # the backup method worked, continue
    ############# get just the article part of the html and clean it #############
    # create a document object from the raw html, then get the summary version of the page, which is the main content
    doc = Document(raw_page)
    html = doc.summary()
    # create a Beautiful Soup object
    soup = BeautifulSoup(html, 'html.parser')
    # remove all tags with no content or only whitespace
    for tag in soup.find_all(True):
        if not tag.contents or (tag.string and tag.string.strip() == ''):
            tag.extract()
    # add line breaks after certain tags
    for tag_name in ['ol', 'ul', 'li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for tag in soup.find_all(tag_name):
            tag.insert_after(NavigableString('\n\n'))
    # strip HTML tags
    clean_text = soup.get_text()
    # remove extra linebreaks
    clean_text = re.sub(r'\s*\n+\s*', '\n\n', clean_text)
    # unsmarten quotes
    clean_text = re.sub(r"[‘’]", "'", clean_text)  # replace smart single quotes with standard single quotes
    clean_text = re.sub(r"[“”]", '"', clean_text)  # replace smart double quotes with standard double quotes
    clean_text = re.sub(r'"', "'", clean_text)  # replace double quotes with single quotes, which helps on json output, since in json double quotes need to be escaped
    # remove extra spaces at the start and end of the text
    clean_text = clean_text.strip()
    if debug:
        cprint("Cleaned text:", 'red')
        paragraphs = clean_text.split('\n\n')
        wrapped_paragraphs = [textwrap.fill(p, width=80, subsequent_indent='\t', initial_indent='\t') for p in paragraphs]
        wrapped_value = '\n\n'.join(wrapped_paragraphs)
        cprint(f"{wrapped_value}\n", 'yellow')
    # clean_text = json.dumps(clean_text)
    ascii_text = unidecode(clean_text)
    return ascii_text
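# example usage (hypothetical url); returns plain ascii article text, or False on failure:
#   text = get_page("https://example.com/some-article", debug=True)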
# run the ollama request
def ollama(prompt, debug=False):
    data = {
        "model": "llama3",
        "prompt": prompt,
        "stream": False,
        "format": "json",
    }
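    # "stream": False makes Ollama return the whole completion in a single JSON body
    # rather than a stream of chunks, and "format": "json" asks Ollama to constrain
    # the model to valid JSON output, which the prompt below also requests explicitly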
    try:
        response = requests.post('http://localhost:11434/api/generate', data=json.dumps(data), timeout=60)
        response_data = response.json()
        output = response_data['response'].strip()
        json_output = json.loads(output, object_pairs_hook=OrderedDict)
        if debug:
            pprint(json_output)
    except Exception as e:
        cprint(f"Ollama Error:\n\t{e}", "red")
        return False
    return json_output
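# on success, ollama() returns an OrderedDict parsed from the model's JSON; given the
# prompt built below it should look roughly like this (illustrative, not actual model output):
#   OrderedDict([('HEADLINE', '...'), ('SUMMARY', '...'), ('KEYWORDS', ['...', '...'])])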
# little function to indent text
def indent(text):
    return textwrap.fill(text, width=80, subsequent_indent='    ', initial_indent='    ')
# get the url from the command line
try:
    url = sys.argv[1]
except IndexError:
    cprint("You need to pass in a url as an argument.", 'magenta')
    exit()
# get the debug flag from the command line; bool() of any non-empty string is True,
# so compare the text itself to decide whether debug was actually requested
try:
    debug = sys.argv[2].strip().lower() in ('true', '1', 'yes')
except IndexError:
    debug = False
# get the page's text
start_time = time.time()
page = get_page(url, debug)
page_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places
if page is False:
    cprint("Something went wrong with the page pull, exiting.", 'magenta')
    exit()
# build the prompt
prompt = "Read the ARTICLE below.\n"
prompt += "Write a no-nonsense HEADLINE for the article.\n"
prompt += "Generate a four sentence SUMMARY for the article focusing on facts, written to sound neutral like National Public Radio (NPR).\n"
prompt += "Generate five to twenty KEYWORDS; keywords should be one or two words, ignore common keywords, illustrate how this article is different from others, and output fewer rather than more keywords.\n"
prompt += "The output JSON should have EXACTLY 3 keys HEADLINE, SUMMARY, and KEYWORDS:\n"
prompt += '{"HEADLINE":"...", "SUMMARY":"...", "KEYWORDS":["...", "...", "...", "..."]}\n\n'
prompt += f"\nARTICLE:\n{page}"
if debug:
    cprint(prompt, 'green')
# run the prompt through ollama
start_time = time.time()
output = ollama(prompt, debug)
ollama_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places
if output is False:
    cprint("Something went wrong with the Ollama request, exiting.", 'magenta')
    exit()
# parse and output the results
for key, value in output.items():
    if key.upper() in ("HEADLINE", "SUMMARY", "KEYWORDS"):
        if isinstance(value, list):
            print(f"{key.capitalize()}:")
            for item in value:
                print(indent(item))
            print()
        else:
            print(f"{key.capitalize()}:\n{indent(value)}\n")
print(f"Execution Time:\n    Page Collection - {round(page_execution_time)} seconds\n    Ollama Request - {round(ollama_execution_time)} seconds")