Created April 12, 2023 19:25
SemiAuto for GPT (draft)
main.py
from semiauto import search, load, summarize, save

"""
Run Query
Extract links
for each link:
    get text, title etc
    summarize text based on goal (extract desired info)
    NOTE: need to build a recursive summarizer (a rough sketch follows this file)
    splitting based on paragraphs (+ a few extra before and after) will give superior results.
Save results to file(s) (JSON? Plain text?)
"""

print("Welcome to SemiAuto for GPT.")
print("Please enter a search Query, and a summarization Task.")

query = input("Query: ")
task = input("Task: ")

OUTPUT_FILE = query + '.txt'

print("Searching web for: " + query)
links = search(query)
# TODO: Cache search results?

# TODO: Rewrite save_summary and the for-loop below to be more elegant
def save_summary(url, title, summary):
    output = ''
    output += title + '\n'
    output += url + '\n'
    output += summary + '\n'
    output += '\n'
    didSave = save(OUTPUT_FILE, output)  # save() will append, not overwrite
    return didSave

for link in links:
    url = link['href']
    title = link['title']
    print("Loading " + url)
    text = load(url)
    if not text:
        print("Failed to load page. Skipping.")
        continue
    print("Obtained text. Summarizing with GPT")
    summary = summarize(text, task)
    print("Obtained summary. Saving to disk")
    if not save_summary(url, title, summary):
        print("Error! Failed to save file: " + OUTPUT_FILE)
    else:
        print(f"Updated summary file {OUTPUT_FILE} with summary for: {title}")

print("Done. Please see " + OUTPUT_FILE)
requirements.txt
requests
beautifulsoup4
openai
tiktoken
duckduckgo-search
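For completeness: these dependencies can be installed the usual way, e.g. pip install -r requirements.txt, assuming the list above is saved under that filename.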
semiauto.py
# mostly frankensteined from https://github.com/Torantulino/Auto-GPT
from duckduckgo_search import ddg
import requests
from bs4 import BeautifulSoup
import json
import os
import openai

from util import count_string_tokens

openai.api_key = os.getenv("OPENAI_API_KEY")

work_dir = "output"

OPENAI_MODEL = "text-davinci-003"
MAX_OUTPUT_LENGTH = 500
MAX_CONTEXT_LENGTH = 4097  # text-davinci-003 is 4097
MAX_PROMPT_LENGTH = MAX_CONTEXT_LENGTH - MAX_OUTPUT_LENGTH

def gpt(prompt, max_tokens=MAX_OUTPUT_LENGTH):
    response = openai.Completion.create(
        # model="gpt-3.5-turbo",  # Alas, chat-only... same for gpt-4. TODO?
        model=OPENAI_MODEL,
        prompt=prompt,
        temperature=0,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    # return response  # todo
    return response.choices[0].text.strip()

### COMMANDS ###

def search(query, max_results=8):
    search_results = []
    results = ddg(query, max_results=max_results)
    # results is a list of dicts:
    """
    [
        {
            "title": "9 Best AI Image Upscalers of April 2023 (Enhance Photos by 800%)",
            "href": "https://www.codingem.com/best-ai-image-upscalers/",
            "body": "9 Best AI Image Upscalers of 2023 (Enhance Photos by 800%) By Artturi Jalli Choosing the best AI image upscaler is crucial to make your images look great when changing their size. With the right type of AI image upscaler you can improve your image resolution by 800%! This is a comprehensive guide to choosing the best AI image upscaler."
        },
        ...
    ]
    """
    # return json.dumps(search_results, ensure_ascii=False, indent=4)
    return results
def load(url):
    user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # Most basic check if the URL is valid:
    if not url.startswith('http'):
        raise Exception("Invalid URL")
    try:
        response = requests.get(url, headers=user_agent_header)
    except requests.exceptions.RequestException as e:
        print("ERROR (requests): " + str(e))
        return None
        # return "Error: " + str(e)
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        print("ERROR (requests): error code " + str(response.status_code) + " for page " + url)
        return None
        # return "Error: HTTP " + str(response.status_code) + " error"
    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup(["script", "style"]):  # remove cruft
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))  # TODO: Why this? (splits on double spaces, as in Auto-GPT)
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
# def get_summarize_prompt(text, task):
#     return f"""Summarize the following text, with the following goal in mind:
# Task:{task}
# ###
# {text}
# ###
# Summary:"""

def get_summarize_prompt(text, task):
    return f"""{text}
###
Using the text above, perform the following task.
Task:{task}
Output:"""

SUMMARIZER_PROMPT_LENGTH = count_string_tokens(get_summarize_prompt('', ''))  # checks how many tokens are used by the summary prompt template
MAX_SUMMARY_INPUT_TEXT_LENGTH = MAX_PROMPT_LENGTH - SUMMARIZER_PROMPT_LENGTH

def split_text_tok(text, chunk_len):
    raise Exception("not implemented")

def split_text_ch(text, chunk_len):
    res = []
    while len(text) > 0:
        res.append(text[0:chunk_len])
        text = text[chunk_len:]
    return res

# TODO: split by sentences, paragraphs etc.
#       See how to get paragraphs from soup
#       ( Does the "  " do that? )
# TODO: use tokens instead of chars
#       Can tiktoken split by tokens? probably (see the sketch after this file)

def summarize_actual(text, task):
    # TODO: make recursive
    # TODO: split by paragraph (and include 1 or 2 paragraphs before/after)
    prompt = get_summarize_prompt(text, task)
    return gpt(prompt)

def summarize(text, task):
    if count_string_tokens(text) > MAX_SUMMARY_INPUT_TEXT_LENGTH:
        chunk_len = int(MAX_SUMMARY_INPUT_TEXT_LENGTH * 3 * 0.95)  # tokens -> chars at roughly 3 chars/token; remove 5% just to be safe. The error seems to be about 1%
        chunks = split_text_ch(text, chunk_len)
    else:
        chunks = [text]
    output = ''
    for chunk in chunks:
        output += summarize_actual(chunk, task) + '\n\n'
    return output
def safe_join(base, *paths):
    """Join one or more path components intelligently."""
    new_path = os.path.join(base, *paths)
    norm_new_path = os.path.normpath(new_path)
    if os.path.commonprefix([base, norm_new_path]) != base:
        raise ValueError("Attempted to access outside of working directory.")
    return norm_new_path

def save(name, text):
    # TODO: make a new folder for each run? Else prefix output w timestamp
    output_data = text + "\n\n"
    try:
        filepath = safe_join(work_dir, name)
        with open(filepath, "a") as f:
            f.write(output_data)
        return True
        # return "Text appended successfully."
    except Exception as e:
        print("ERROR: Could not save file " + name + " because: " + str(e))
        return False
        # return "Error: " + str(e)
util.py
# from https://github.com/Torantulino/Auto-GPT
import tiktoken
from typing import List, Dict

def count_message_tokens(messages: List[Dict[str, str]], model: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens used by a list of messages.

    Args:
        messages (list): A list of messages, each of which is a dictionary containing the role and content of the message.
        model (str): The name of the model to use for tokenization. Defaults to "gpt-3.5-turbo-0301".

    Returns:
        int: The number of tokens used by the list of messages.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        # !Note: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.
        return count_message_tokens(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        # !Note: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.
        return count_message_tokens(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

def count_string_tokens(string: str, model_name: str = "gpt-3.5-turbo-0301") -> int:
    """
    Returns the number of tokens in a text string.

    Args:
        string (str): The text string.
        model_name (str): The name of the encoding to use. (e.g., "gpt-3.5-turbo")

    Returns:
        int: The number of tokens in the text string.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
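For reference, count_message_tokens expects chat-style message dicts while count_string_tokens takes a bare string. A quick usage sketch (the example messages are made up, not from the draft):

# Sketch only: calling the two helpers above.
from util import count_message_tokens, count_string_tokens

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this page for me."},
]
print(count_message_tokens(messages))               # tokens for a chat request (gpt-3.5-turbo-0301 rules)
print(count_string_tokens("Summarize this page"))   # tokens in a plain string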
This version of main.py was written by ChatGPT-4, based on the interface above. It saves each summary to a separate file.
main-gpt4.py
from typing import List, Dict
from semiauto import search, load, summarize, save

def main(query: str, max_results: int = 8, summary_task: str = "summarize"):
    # Perform the web search
    search_results = search(query, max_results)

    # Iterate through the search results
    for i, result in enumerate(search_results):
        # Load the web page
        page_content = load(result['href'])

        if page_content is not None:
            # Summarize the page content
            summary = summarize(page_content, summary_task)

            if summary is not None:
                # Save the summary to a file
                filename = f"summary_{i + 1}.txt"
                saved = save(filename, summary)

                if saved:
                    print(f"Saved summary {i + 1} to {filename}")
                else:
                    print(f"Error saving summary {i + 1} to {filename}")
            else:
                print(f"Error summarizing page {i + 1}: {result['title']}")
        else:
            print(f"Error loading page {i + 1}: {result['title']}")

if __name__ == "__main__":
    query = input("Enter a search query: ")
    task = input("Enter a task:")  # NOTE(andai): I added this, to allow more useful tasks than just summarization
    main(query, summary_task=task)
This version of main.py was written by ChatGPT-3.5, based on the interface above.
Note: this version produces no output and saves nothing until the end of the program, which may take a few minutes.
main-gpt3.py