Generate a headline, keywords, and a summary for a given article, so you can decide from those values whether or not to read it.
# run like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html"
# you can optionally pass in a debug flag as the second argument, like:
# python3 llm_page.py "https://www.cnn.com/2023/12/09/business/cosmcs-mcdonalds-drinks/index.html" True
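#
# output looks roughly like this (illustrative only; the actual values depend on the article and model):
#   Headline:
#    ...
#   Summary:
#    ...
#   Keywords:
#    ...
#   Execution Time:
#    Page Collection - N seconds
#    Ollama Request - N seconds
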
import json
import requests
from termcolor import cprint
from collections import OrderedDict
from readability import Document
from bs4 import BeautifulSoup, NavigableString
from pyppeteer import launch
from pyppeteer_stealth import stealth
import asyncio
import time
import re
import sys
import logging
import textwrap
from pprint import pprint
from unidecode import unidecode

logging.getLogger().setLevel(logging.CRITICAL)  # hiding errors that occur when pyppeteer crashes
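
# third-party dependencies (these are the likely PyPI package names; nothing is pinned here):
#   pip install requests termcolor readability-lxml beautifulsoup4 pyppeteer pyppeteer-stealth Unidecode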


############# get just the actual important text from a webpage #############
def get_page(url, debug=False):
    # get the page's raw html. using pyppeteer because it will render all the page content including javascript,
    # plus pyppeteer gets around lots of things that would otherwise block requests, like cloudflare, etc.
    async def get_raw_page():  # starting up pyppeteer
        browser = await launch()
        page = await browser.newPage()
        await stealth(page)
        await page.goto(url, timeout=20000)  # set the timeout to 20 seconds, just in case something goes wrong
        raw_page = await page.content()
        await browser.close()
        return raw_page

    # try to get the page
    try:
        raw_page = asyncio.get_event_loop().run_until_complete(get_raw_page())  # get the page
    except Exception as e:
        cprint("A pyppeteer error occurred, attempting backup page pull.", 'red')
        raw_page = requests.get(url).text  # if something goes wrong with pyppeteer, just use requests as a fallback
        if len(raw_page) <= 100:
            cprint("Something went wrong, the page is too short to be valid.", 'magenta')
            return False
        else:
            cprint("Successfully pulled page content using backup method.\n", 'red')  # the backup method worked, continue

    ############# get just the article part of the html and clean it #############
    # create a document object from the raw html, then get the summary version of the page, which is the main content of the page
    doc = Document(raw_page)
    html = doc.summary()

    # create a Beautiful Soup object
    soup = BeautifulSoup(html, 'html.parser')

    # remove all tags with no content or only whitespace
    for tag in soup.find_all(True):
        if not tag.contents or (tag.string and tag.string.strip() == ''):
            tag.extract()

    # add line breaks after certain tags
    for tag_name in ['ol', 'ul', 'li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        for tag in soup.find_all(tag_name):
            tag.insert_after(NavigableString('\n\n'))

    # strip HTML tags
    clean_text = soup.get_text()

    # remove extra linebreaks
    clean_text = re.sub(r'\s*\n+\s*', '\n\n', clean_text)

    # unsmarten quotes
    clean_text = re.sub(r"[‘’]", "'", clean_text)  # replace smart single quotes with standard single quotes
    clean_text = re.sub(r"[“”]", '"', clean_text)  # replace smart double quotes with standard double quotes
    clean_text = re.sub(r'"', "'", clean_text)  # replace double quotes with single quotes, helps on json output, since in json double quotes need to be escaped
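    # e.g. “He said ‘wait’” ends up as 'He said 'wait'' (illustrative example),
    # so no literal double quotes survive into the text handed to the model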

    # remove extra spaces at the start and end of the text
    clean_text = clean_text.strip()

    if debug:
        cprint("Cleaned text:", 'red')
        paragraphs = clean_text.split('\n\n')
        wrapped_paragraphs = [textwrap.fill(p, width=80, subsequent_indent='\t', initial_indent='\t') for p in paragraphs]
        wrapped_value = '\n\n'.join(wrapped_paragraphs)
        cprint(f"{wrapped_value}\n", 'yellow')

    #clean_text = json.dumps(clean_text)
    ascii_text = unidecode(clean_text)

    return ascii_text


# run the ollama request
def ollama(prompt, debug=False):
    data = {
        "model": "llama3",
        "prompt": prompt,
        "stream": False,
        "format": "json",
    }
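    # with "stream": False the /api/generate endpoint replies with a single JSON object
    # whose "response" field holds the generated text (used below); abridged, illustrative shape:
    #   {"model": "llama3", "response": "{\"HEADLINE\": ...}", "done": true}
    # "format": "json" asks Ollama to constrain the model to valid JSON output, which is
    # what lets json.loads() below parse the response directly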
    try:
        response = requests.post('http://localhost:11434/api/generate', data=json.dumps(data), timeout=60)
        response_data = response.json()
        output = response_data['response'].strip()
        json_output = json.loads(output, object_pairs_hook=OrderedDict)
        if debug:
            pprint(json_output)
    except Exception as e:
        cprint(f"Ollama Error:\n\t{e}", "red")
        return False
    return json_output


# little function to indent text
def indent(text):
    return textwrap.fill(text, width=80, subsequent_indent=' ', initial_indent=' ')


# get the url from the command line
try:
    url = sys.argv[1]
except IndexError:
    cprint("You need to pass in a url as an argument.", 'magenta')
    exit()

# get the debug flag from the command line; make sure it actually parses to True/False
# (bool() on any non-empty string would be True, so compare the string itself)
try:
    debug = sys.argv[2].strip().lower() == 'true'
except IndexError:
    debug = False

# get the page's text
start_time = time.time()
page = get_page(url, debug)
page_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places

if page is False:
    cprint("Something went wrong with the page pull, exiting.", 'magenta')
    exit()

# build the prompt
prompt = "Read the ARTICLE below.\n"
prompt += "Write a no-nonsense HEADLINE for the article.\n"
prompt += "Generate a four sentence SUMMARY for the article focusing on facts, written to sound neutral like National Public Radio (NPR).\n"
prompt += "Generate five to twenty KEYWORDS, keywords should be one or two words, ignore common keywords and illustrate how this article is different than others, and output fewer rather than more keywords.\n"
prompt += "The output JSON should have EXACTLY 3 keys HEADLINE, SUMMARY, and KEYWORDS:\n"
prompt += '{"HEADLINE":"...", "SUMMARY":"...", "KEYWORDS":["...", "...", "...", "..."]}\n\n'
prompt += f"\nARTICLE:\n{page}"

if debug:
    cprint(prompt, 'green')

# run the prompt through ollama
start_time = time.time()
output = ollama(prompt, debug)
ollama_execution_time = round(time.time() - start_time, 3)  # seconds, rounded to three decimal places

if output is False:
    cprint("Something went wrong with the Ollama request, exiting.", 'magenta')
    exit()

# parse and output the results
for key, value in output.items():
    if key.upper() in ("HEADLINE", "SUMMARY", "KEYWORDS"):
        if isinstance(value, list):
            print(f"{key.capitalize()}:")
            for item in value:
                print(indent(item))
            print()
        else:
            print(f"{key.capitalize()}:\n{indent(value)}\n")

print(f"Execution Time:\n Page Collection - {round(page_execution_time)} seconds\n Ollama Request - {round(ollama_execution_time)} seconds")