Generate a headline, keywords, and a summary for a given article, so you can decide from those values whether or not to read it.
# run like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html"
# you can optionally pass in a debug flag as the second argument, like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html" True
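#
# output looks roughly like this (illustrative only; the actual values depend on the article and model):
#   Headline:
#    ...
#   Summary:
#    ...
#   Keywords:
#    ...
#   Execution Time:
#    Page Collection - N seconds
#    Ollama Request - N seconds
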
import json
import requests
from termcolor import cprint
from collections import OrderedDict
from readability import Document
from bs4 import BeautifulSoup, NavigableString
from pyppeteer import launch
from pyppeteer_stealth import stealth
import asyncio
import time
import re
import sys
import logging
import textwrap
from pprint import pprint
from unidecode import unidecode

logging.getLogger().setLevel(logging.CRITICAL)  # hiding errors that occur when pyppeteer crashes
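
# third-party dependencies (these are the likely PyPI package names; nothing is pinned here):
#   pip install requests termcolor readability-lxml beautifulsoup4 pyppeteer pyppeteer-stealth Unidecode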


############# get just the actual important text from a webpage #############
def get_page(url, debug=False):
    # get the page's raw html. using pyppeteer because it will render all the page content including javascript,
    # plus pyppeteer gets around lots of things that would otherwise block requests, like cloudflare, etc.
    async def get_raw_page():  # starting up pyppeteer
        browser = await launch()
        page = await browser.newPage()
        await stealth(page)
        await page.goto(url, timeout=20000)  # set the timeout to 20 seconds, just in case something goes wrong
        raw_page = await page.content()
        await browser.close()
        return raw_page

    # try to get the page
    try:
        raw_page = asyncio.get_event_loop().run_until_complete(get_raw_page())  # get the page
    except Exception as e:
        cprint("A pyppeteer error occurred, attempting backup page pull.", 'red')
        raw_page = requests.get(url).text  # if something goes wrong with pyppeteer, just use requests as a fallback
        if len(raw_page) <= 100:
            cprint("Something went wrong, the page is too short to be valid.", 'magenta')
            return False
        else:
            cprint("Successfully pulled page content using backup method.\n", 'red')  # the backup method worked, continue

    ############# get just the article part of the html and clean it #############
    # create a document object from the raw html, then get the summary version of the page, which is the main content of the page
    doc = Document(raw_page)
    html = doc.summary()

    # create a Beautiful Soup object
    soup = BeautifulSoup(html, 'html.parser')

    # remove all tags with no content or only whitespace
    for tag in soup.find_all(True):
        if not tag.contents or (tag.string and tag.string.strip() == ''):
            tag.extract()

    # add line breaks after certain tags
    for tag_name in ['ol', 'ul', 'li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for tag in soup.find_all(tag_name):
            tag.insert_after(NavigableString('\n\n'))

    # strip HTML tags
    clean_text = soup.get_text()

    # remove extra linebreaks
    clean_text = re.sub(r'\s*\n+\s*', '\n\n', clean_text)

    # unsmarten quotes
    clean_text = re.sub(r"[‘’]", "'", clean_text)  # replace smart single quotes with standard single quotes
    clean_text = re.sub(r"[“”]", '"', clean_text)  # replace smart double quotes with standard double quotes
    clean_text = re.sub(r'"', "'", clean_text)  # replace double quotes with single quotes, helps on json output, since in json double quotes need to be escaped
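    # e.g. “He said ‘wait’” ends up as 'He said 'wait'' (illustrative example),
    # so no literal double quotes survive into the text handed to the model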

    # remove extra spaces at the start and end of the text
    clean_text = clean_text.strip()

    if debug:
        cprint("Cleaned text:", 'red')
        paragraphs = clean_text.split('\n\n')
        wrapped_paragraphs = [textwrap.fill(p, width=80, subsequent_indent='\t', initial_indent='\t') for p in paragraphs]
        wrapped_value = '\n\n'.join(wrapped_paragraphs)
        cprint(f"{wrapped_value}\n", 'yellow')

    #clean_text = json.dumps(clean_text)
    ascii_text = unidecode(clean_text)

    return ascii_text


# run the ollama request
def ollama(prompt, debug=False):
    data = {
        "model": "llama3",
        "prompt": prompt,
        "stream": False,
        "format": "json",
    }
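    # with "stream": False the /api/generate endpoint replies with a single JSON object
    # whose "response" field holds the generated text (used below); abridged, illustrative shape:
    #   {"model": "llama3", "response": "{\"HEADLINE\": ...}", "done": true}
    # "format": "json" asks Ollama to constrain the model to valid JSON output, which is
    # what lets json.loads() below parse the response directly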
    try:
        response = requests.post('http://localhost:11434/api/generate', data=json.dumps(data), timeout=60)
        response_data = response.json()
        output = response_data['response'].strip()
        json_output = json.loads(output, object_pairs_hook=OrderedDict)
        if debug:
            pprint(json_output)
    except Exception as e:
        cprint(f"Ollama Error:\n\t{e}", "red")
        return False
    return json_output


# little function to indent text
def indent(text):
    return textwrap.fill(text, width=80, subsequent_indent=' ', initial_indent=' ')


# get the url from the command line
try:
    url = sys.argv[1]
except IndexError:
    cprint("You need to pass in a url as an argument.", 'magenta')
    exit()

# get the debug flag from the command line; make sure it actually parses to True/False
# (bool() on any non-empty string would be True, so compare the string itself)
try:
    debug = sys.argv[2].strip().lower() == 'true'
except IndexError:
    debug = False

# get the page's text
start_time = time.time()
page = get_page(url, debug)
page_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places

if page is False:
    cprint("Something went wrong with the page pull, exiting.", 'magenta')
    exit()

# build the prompt
prompt = "Read the ARTICLE below.\n"
prompt += "Write a no-nonsense HEADLINE for the article.\n"
prompt += "Generate a four sentence SUMMARY for the article focusing on facts, written to sound neutral like National Public Radio (NPR).\n"
prompt += "Generate five to twenty KEYWORDS, keywords should be one or two words, ignore common keywords and illustrate how this article is different than others, and output fewer rather than more keywords.\n"
prompt += "The output JSON should have EXACTLY 3 keys HEADLINE, SUMMARY, and KEYWORDS:\n"
prompt += '{"HEADLINE":"...", "SUMMARY":"...", "KEYWORDS":["...", "...", "...", "..."]}\n\n'
prompt += f"\nARTICLE:\n{page}"

if debug:
    cprint(prompt, 'green')

# run the prompt through ollama
start_time = time.time()
output = ollama(prompt, debug)
ollama_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places

if output is False:
    cprint("Something went wrong with the Ollama request, exiting.", 'magenta')
    exit()

# parse and output the results
for key, value in output.items():
    if key.upper() in ("HEADLINE", "SUMMARY", "KEYWORDS"):
        if isinstance(value, list):
            print(f"{key.capitalize()}:")
            for item in value:
                print(indent(item))
            print()
        else:
            print(f"{key.capitalize()}:\n{indent(value)}\n")

print(f"Execution Time:\n Page Collection - {round(page_execution_time)} seconds\n Ollama Request - {round(ollama_execution_time)} seconds")