@avelican
Created April 12, 2023 19:25
SemiAuto for GPT (draft)
main.py

from semiauto import search, load, summarize, save
"""
Run Query
Extract links
for each link:
get text, title etc
summarize text based on goal (extract desired info)
NOTE: need to build a recursive summarizer
splitting based on paragraphs (+ a few extra before and after) will give superior results.
Save results to file(s) (JSON? Plain text?)
"""
print("Welcome to SemiAuto for GPT.")
print("Please enter a search Query, and a summarization Task.")
query = input("Query: ")
task = input("Task: ")
OUTPUT_FILE = query + '.txt'
print("Searching web for: " + query)
links = search(query)
# TODO: Cache search results?
# TODO Rewrite save_summary and the for-loop below to be more elegant
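# Sketch for the caching TODO above (illustrative, not part of the original gist):
# keep a per-query JSON cache on disk so repeated runs can skip the web search.
# (To use it, the `links = search(query)` call above would become `links = cached_search(query)`.)
import json, os
def cached_search(q, cache_file="search_cache.json"):
    cache = json.load(open(cache_file)) if os.path.exists(cache_file) else {}
    if q not in cache:
        cache[q] = search(q)
        with open(cache_file, "w") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
    return cache[q]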
def save_summary(url, title, summary):
    output = ''
    output += title + '\n'
    output += url + '\n'
    output += summary + '\n'
    output += '\n'
    didSave = save(OUTPUT_FILE, output)  # save() will append, not overwrite
    return didSave
for link in links:
    url = link['href']
    title = link['title']
    print("Loading " + url)
    text = load(url)
    if not text:
        print("Failed to load page. Skipping")
        continue
    print("Obtained text. Summarizing with GPT")
    summary = summarize(text, task)
    print("Obtained summary. Saving to disk")
    if not save_summary(url, title, summary):
        print("Error! Failed to save file: " + OUTPUT_FILE)
    else:
        print(f"Updated summary file {OUTPUT_FILE} with summary for: {title}")
print("Done. Please see " + OUTPUT_FILE)
requirements.txt

requests
beautifulsoup4
openai
tiktoken
duckduckgo-search
semiauto.py

# mostly frankensteined from https://github.com/Torantulino/Auto-GPT
from duckduckgo_search import ddg
import requests
from bs4 import BeautifulSoup
import json
import os
import openai
from util import count_string_tokens
openai.api_key = os.getenv("OPENAI_API_KEY")
work_dir = "output"
OPENAI_MODEL="text-davinci-003"
MAX_OUTPUT_LENGTH = 500
MAX_CONTEXT_LENGTH = 4097 # text-davinci-003 is 4097
MAX_PROMPT_LENGTH = MAX_CONTEXT_LENGTH - MAX_OUTPUT_LENGTH
def gpt(prompt, max_tokens=MAX_OUTPUT_LENGTH):
    response = openai.Completion.create(
        # model="gpt-3.5-turbo", # Alas, chat-only... same for gpt-4. TODO ?
        model=OPENAI_MODEL,
        prompt=prompt,
        temperature=0,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    # return response # todo
    return response.choices[0].text.strip()
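# Sketch of a chat-model variant for the TODO above (illustrative, not part of the
# original gist). gpt-3.5-turbo and gpt-4 are chat-only, so with this (pre-1.0)
# openai library they go through ChatCompletion with a messages list instead of a
# plain prompt string.
def gpt_chat(prompt, max_tokens=MAX_OUTPUT_LENGTH, model="gpt-3.5-turbo"):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=max_tokens,
    )
    return response.choices[0].message["content"].strip()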
### COMMANDS ###
def search(query, max_results=8):
    search_results = []
    results = ddg(query, max_results=max_results)
    # results is a list of dicts:
    """
    [
        {
            "title": "9 Best AI Image Upscalers of April 2023 (Enhance Photos by 800%)",
            "href": "https://www.codingem.com/best-ai-image-upscalers/",
            "body": "9 Best AI Image Upscalers of 2023 (Enhance Photos by 800%) By Artturi Jalli Choosing the best AI image upscaler is crucial to make your images look great when changing their size. With the right type of AI image upscaler you can improve your image resolution by 800%! This is a comprehensive guide to choosing the best AI image upscaler."
        },
        ...
    ]
    """
    # return json.dumps(search_results, ensure_ascii=False, indent=4)
    return results
def load(url):
    user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # Most basic check if the URL is valid:
    if not url.startswith('http'):
        raise Exception("Invalid URL")
    try:
        response = requests.get(url, headers=user_agent_header)
    except requests.exceptions.RequestException as e:
        print("ERROR (requests): " + str(e))
        return None
        # return "Error: " + str(e)
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        print("ERROR (requests): error code " + str(response.status_code) + " for page " + url)
        return None
        # return "Error: HTTP " + str(response.status_code) + " error"
    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup(["script", "style"]):  # remove cruft
        script.extract()
    text = soup.get_text()
    # Collapse whitespace: strip each line, split lines into phrases on double spaces, drop empties
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))  # TODO: Why this?
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
# def get_summarize_prompt(text, task):
# return f"""Summarize the following text, with the following goal in mind:
# Task:{task}
# ###
# {text}
# ###
# Summary:"""
def get_summarize_prompt(text, task):
    return f"""{text}
###
Using the text above, perform the following task.
Task:{task}
Output:"""
SUMMARIZER_PROMPT_LENGTH = count_string_tokens(get_summarize_prompt('', ''))  # checks how many tokens are used by the summary prompt template
# NOTE: count_string_tokens defaults to the gpt-3.5-turbo tokenizer, while OPENAI_MODEL is text-davinci-003, so these counts are approximate
MAX_SUMMARY_INPUT_TEXT_LENGTH = MAX_PROMPT_LENGTH - SUMMARIZER_PROMPT_LENGTH
def split_text_tok(text, chunk_len):
    raise Exception("not implemented")

def split_text_ch(text, chunk_len):
    res = []
    while len(text) > 0:
        res.append(text[0:chunk_len])
        text = text[chunk_len:]
    return res
# todo split by sentences, paragraphs etc.
# See how to get paragraphs from soup
# # ( Does the " " do that? )
# TODO use tokens instead of chars
# Can tiktoken split by tokens? probably
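# Sketch answering the TODO above (illustrative, not part of the original gist):
# tiktoken can effectively split by tokens. Encode the text to token ids, slice
# the id list into chunk_len-sized pieces, and decode each piece back to a string.
def split_text_tok_sketch(text, chunk_len, model_name="gpt-3.5-turbo-0301"):
    import tiktoken  # local import so this sketch is self-contained
    encoding = tiktoken.encoding_for_model(model_name)
    tokens = encoding.encode(text)
    return [encoding.decode(tokens[i:i + chunk_len]) for i in range(0, len(tokens), chunk_len)]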
def summarize_actual(text, task):
    # TODO: make recursive
    # TODO: split by paragraph ( and include 1 or 2 paragraphs before/after)
    prompt = get_summarize_prompt(text, task)
    return gpt(prompt)
def summarize(text, task):
    if count_string_tokens(text) > MAX_SUMMARY_INPUT_TEXT_LENGTH:
        # token budget -> characters, assuming roughly 3 chars per token; remove 5% just to be safe. The error seems to be about 1%
        chunk_len = int(MAX_SUMMARY_INPUT_TEXT_LENGTH * 3 * 0.95)
        chunks = split_text_ch(text, chunk_len)
    else:
        chunks = [text]
    output = ''
    for chunk in chunks:
        output += summarize_actual(chunk, task) + '\n\n'
    return output
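# Sketch of the "make recursive" TODO (illustrative, not part of the original gist):
# summarize the text, and if the combined chunk summaries still exceed the input
# budget, summarize the summaries again, up to a small depth limit.
def summarize_recursive_sketch(text, task, max_depth=3):
    summary = summarize(text, task)
    if max_depth > 0 and count_string_tokens(summary) > MAX_SUMMARY_INPUT_TEXT_LENGTH:
        return summarize_recursive_sketch(summary, task, max_depth - 1)
    return summary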
def safe_join(base, *paths):
    """Join one or more path components intelligently."""
    new_path = os.path.join(base, *paths)
    norm_new_path = os.path.normpath(new_path)
    if os.path.commonprefix([base, norm_new_path]) != base:
        raise ValueError("Attempted to access outside of working directory.")
    return norm_new_path
def save(name, text):
    # TODO: make a new folder for each run? Else prefix output w timestamp
    output_data = text + "\n\n"
    try:
        filepath = safe_join(work_dir, name)
        with open(filepath, "a") as f:
            f.write(output_data)
        return True
        # return "Text appended successfully."
    except Exception as e:
        print("ERROR: Could not save file " + name + " because: " + str(e))
        return False
        # return "Error: " + str(e)
util.py

# from https://github.com/Torantulino/Auto-GPT
import tiktoken
from typing import List, Dict
def count_message_tokens(messages: List[Dict[str, str]], model: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens used by a list of messages.

    Args:
        messages (list): A list of messages, each of which is a dictionary containing the role and content of the message.
        model (str): The name of the model to use for tokenization. Defaults to "gpt-3.5-turbo-0301".

    Returns:
        int: The number of tokens used by the list of messages.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        # !Note: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.
        return count_message_tokens(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        # !Note: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.
        return count_message_tokens(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
def count_string_tokens(string: str, model_name: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens in a text string.

    Args:
        string (str): The text string.
        model_name (str): The name of the model whose tokenizer to use (e.g., "gpt-3.5-turbo").

    Returns:
        int: The number of tokens in the text string.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
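# Usage sketch (illustrative, not part of the original gist): count_message_tokens
# adds per-message framing overhead on top of the content tokens, while
# count_string_tokens counts the raw string only.
if __name__ == "__main__":
    msgs = [{"role": "user", "content": "Hello, world"}]
    print(count_message_tokens(msgs))            # content tokens + message framing
    print(count_string_tokens("Hello, world"))   # content tokens only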
@avelican (Author)

This version of main.py was written by ChatGPT-4, based on the interface above.

This version saves each summary to a separate file.

main-gpt4.py

from typing import List, Dict
from semiauto import search, load, summarize, save

def main(query: str, max_results: int = 8, summary_task: str = "summarize"):
    # Perform the web search
    search_results = search(query, max_results)

    # Iterate through the search results
    for i, result in enumerate(search_results):
        # Load the web page
        page_content = load(result['href'])

        if page_content is not None:
            # Summarize the page content
            summary = summarize(page_content, summary_task)

            if summary is not None:
                # Save the summary to a file
                filename = f"summary_{i + 1}.txt"
                saved = save(filename, summary)

                if saved:
                    print(f"Saved summary {i + 1} to {filename}")
                else:
                    print(f"Error saving summary {i + 1} to {filename}")
            else:
                print(f"Error summarizing page {i + 1}: {result['title']}")
        else:
            print(f"Error loading page {i + 1}: {result['title']}")

if __name__ == "__main__":
    query = input("Enter a search query: ")
    task = input("Enter a task:") # NOTE(andai): I added this, to allow more useful tasks than just summarization
    main(query, summary_task=task)
