Created April 12, 2023 19:25
SemiAuto for GPT (draft)
main.py
from semiauto import search, load, summarize, save

"""
Run Query
Extract links
for each link:
    get text, title etc
    summarize text based on goal (extract desired info)
    NOTE: need to build a recursive summarizer (a rough sketch follows this file)
    splitting based on paragraphs (+ a few extra before and after) will give superior results.
Save results to file(s) (JSON? Plain text?)
"""

print("Welcome to SemiAuto for GPT.")
print("Please enter a search Query, and a summarization Task.")

query = input("Query: ")
task = input("Task: ")

OUTPUT_FILE = query + '.txt'

print("Searching web for: " + query)
links = search(query)
# TODO: Cache search results?

# TODO: Rewrite save_summary and the for-loop below to be more elegant
def save_summary(url, title, summary):
    output = ''
    output += title + '\n'
    output += url + '\n'
    output += summary + '\n'
    output += '\n'
    didSave = save(OUTPUT_FILE, output)  # save() will append, not overwrite
    return didSave

for link in links:
    url = link['href']
    title = link['title']
    print("Loading " + url)
    text = load(url)
    if not text:
        print("Failed to load page. Skipping.")
        continue
    print("Obtained text. Summarizing with GPT")
    summary = summarize(text, task)
    print("Obtained summary. Saving to disk")
    if not save_summary(url, title, summary):
        print("Error! Failed to save file: " + OUTPUT_FILE)
    else:
        print(f"Updated summary file {OUTPUT_FILE} with summary for: {title}")

print("Done. Please see " + OUTPUT_FILE)
requirements.txt
requests
beautifulsoup4
openai
tiktoken
duckduckgo-search
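For completeness: these dependencies can be installed the usual way, e.g. pip install -r requirements.txt, assuming the list above is saved under that filename.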
semiauto.py
# mostly frankensteined from https://github.com/Torantulino/Auto-GPT
from duckduckgo_search import ddg
import requests
from bs4 import BeautifulSoup
import json
import os
import openai

from util import count_string_tokens

openai.api_key = os.getenv("OPENAI_API_KEY")

work_dir = "output"

OPENAI_MODEL = "text-davinci-003"
MAX_OUTPUT_LENGTH = 500
MAX_CONTEXT_LENGTH = 4097  # text-davinci-003 is 4097
MAX_PROMPT_LENGTH = MAX_CONTEXT_LENGTH - MAX_OUTPUT_LENGTH

def gpt(prompt, max_tokens=MAX_OUTPUT_LENGTH):
    response = openai.Completion.create(
        # model="gpt-3.5-turbo",  # Alas, chat-only... same for gpt-4. TODO?
        model=OPENAI_MODEL,
        prompt=prompt,
        temperature=0,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    # return response  # todo
    return response.choices[0].text.strip()

### COMMANDS ###

def search(query, max_results=8):
    search_results = []
    results = ddg(query, max_results=max_results)
    # results is a list of dicts:
    """
    [
        {
            "title": "9 Best AI Image Upscalers of April 2023 (Enhance Photos by 800%)",
            "href": "https://www.codingem.com/best-ai-image-upscalers/",
            "body": "9 Best AI Image Upscalers of 2023 (Enhance Photos by 800%) By Artturi Jalli Choosing the best AI image upscaler is crucial to make your images look great when changing their size. With the right type of AI image upscaler you can improve your image resolution by 800%! This is a comprehensive guide to choosing the best AI image upscaler."
        },
        ...
    ]
    """
    # return json.dumps(search_results, ensure_ascii=False, indent=4)
    return results
def load(url):
    user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # Most basic check if the URL is valid:
    if not url.startswith('http'):
        raise Exception("Invalid URL")
    try:
        response = requests.get(url, headers=user_agent_header)
    except requests.exceptions.RequestException as e:
        print("ERROR (requests): " + str(e))
        return None
        # return "Error: " + str(e)
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        print("ERROR (requests): error code " + str(response.status_code) + " for page " + url)
        return None
        # return "Error: HTTP " + str(response.status_code) + " error"
    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup(["script", "style"]):  # remove cruft
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))  # TODO: Why this? (splits on double spaces, as in Auto-GPT)
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
# def get_summarize_prompt(text, task):
#     return f"""Summarize the following text, with the following goal in mind:
# Task:{task}
# ###
# {text}
# ###
# Summary:"""

def get_summarize_prompt(text, task):
    return f"""{text}
###
Using the text above, perform the following task.
Task:{task}
Output:"""

SUMMARIZER_PROMPT_LENGTH = count_string_tokens(get_summarize_prompt('', ''))  # checks how many tokens are used by the summary prompt template
MAX_SUMMARY_INPUT_TEXT_LENGTH = MAX_PROMPT_LENGTH - SUMMARIZER_PROMPT_LENGTH

def split_text_tok(text, chunk_len):
    raise Exception("not implemented")

def split_text_ch(text, chunk_len):
    res = []
    while len(text) > 0:
        res.append(text[0:chunk_len])
        text = text[chunk_len:]
    return res

# TODO: split by sentences, paragraphs etc.
#       See how to get paragraphs from soup
#       ( Does the "  " do that? )
# TODO: use tokens instead of chars
#       Can tiktoken split by tokens? probably (see the sketch after this file)

def summarize_actual(text, task):
    # TODO: make recursive
    # TODO: split by paragraph (and include 1 or 2 paragraphs before/after)
    prompt = get_summarize_prompt(text, task)
    return gpt(prompt)

def summarize(text, task):
    if count_string_tokens(text) > MAX_SUMMARY_INPUT_TEXT_LENGTH:
        chunk_len = int(MAX_SUMMARY_INPUT_TEXT_LENGTH * 3 * 0.95)  # tokens -> chars at roughly 3 chars/token; remove 5% just to be safe. The error seems to be about 1%
        chunks = split_text_ch(text, chunk_len)
    else:
        chunks = [text]
    output = ''
    for chunk in chunks:
        output += summarize_actual(chunk, task) + '\n\n'
    return output
def safe_join(base, *paths):
    """Join one or more path components intelligently."""
    new_path = os.path.join(base, *paths)
    norm_new_path = os.path.normpath(new_path)
    if os.path.commonprefix([base, norm_new_path]) != base:
        raise ValueError("Attempted to access outside of working directory.")
    return norm_new_path

def save(name, text):
    # TODO: make a new folder for each run? Else prefix output w timestamp
    output_data = text + "\n\n"
    try:
        filepath = safe_join(work_dir, name)
        with open(filepath, "a") as f:
            f.write(output_data)
        return True
        # return "Text appended successfully."
    except Exception as e:
        print("ERROR: Could not save file " + name + " because: " + str(e))
        return False
        # return "Error: " + str(e)
util.py
# from https://github.com/Torantulino/Auto-GPT
import tiktoken
from typing import List, Dict

def count_message_tokens(messages: List[Dict[str, str]], model: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens used by a list of messages.

    Args:
        messages (list): A list of messages, each of which is a dictionary containing the role and content of the message.
        model (str): The name of the model to use for tokenization. Defaults to "gpt-3.5-turbo-0301".

    Returns:
        int: The number of tokens used by the list of messages.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        # !Note: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.
        return count_message_tokens(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        # !Note: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.
        return count_message_tokens(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

def count_string_tokens(string: str, model_name: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens in a text string.

    Args:
        string (str): The text string.
        model_name (str): The name of the encoding to use. (e.g., "gpt-3.5-turbo")

    Returns:
        int: The number of tokens in the text string.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
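For reference, count_message_tokens expects chat-style message dicts while count_string_tokens takes a bare string. A quick usage sketch (the example messages are made up, not from the draft):

# Sketch only: calling the two helpers above.
from util import count_message_tokens, count_string_tokens

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this page for me."},
]
print(count_message_tokens(messages))               # tokens for a chat request (gpt-3.5-turbo-0301 rules)
print(count_string_tokens("Summarize this page"))   # tokens in a plain string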
This version of main.py was written by ChatGPT-4, based on the interface above. It saves each summary to a separate file.
main-gpt4.py
from typing import List, Dict
from semiauto import search, load, summarize, save

def main(query: str, max_results: int = 8, summary_task: str = "summarize"):
    # Perform the web search
    search_results = search(query, max_results)

    # Iterate through the search results
    for i, result in enumerate(search_results):
        # Load the web page
        page_content = load(result['href'])

        if page_content is not None:
            # Summarize the page content
            summary = summarize(page_content, summary_task)

            if summary is not None:
                # Save the summary to a file
                filename = f"summary_{i + 1}.txt"
                saved = save(filename, summary)

                if saved:
                    print(f"Saved summary {i + 1} to {filename}")
                else:
                    print(f"Error saving summary {i + 1} to {filename}")
            else:
                print(f"Error summarizing page {i + 1}: {result['title']}")
        else:
            print(f"Error loading page {i + 1}: {result['title']}")

if __name__ == "__main__":
    query = input("Enter a search query: ")
    task = input("Enter a task:")  # NOTE(andai): I added this, to allow more useful tasks than just summarization
    main(query, summary_task=task)
This version of main.py was written by ChatGPT-3.5, based on the interface above.
Note: this version produces no output and saves nothing until the end of the program, which may take a few minutes.
main-gpt3.py