Python script to scrape URLs and create prompt-response pairs suitable for AI LLM tuning
#!/usr/bin/env python3
"""
fetchTuningPairs.py
Author: Shane Kretzmann
Email: ShaneKretzmann@gmail.com
Gist: https://gist.github.com/uberdragon/b779de99e39af4dad91d5e1fc39221eb

Description:
    This script fetches LLM tuning PROMPT-RESPONSE pairs from the textual content extracted
    from a list of URLs. The extracted text is sent to the chatGPT model, which generates
    high-quality prompt-response pairs. The generated pairs are then saved to individual
    text files, one file per URL.

Usage:
    fetchTuningPairs.py [-h] [-v] [-f FILE] [-o OUTPUT] [-m MODEL] [-n NUM_REQUESTS]

Arguments:
    -h, --help                  Show the help message and exit.
    -v, --verbose               Enable verbose mode to display detailed information during processing.
    -f FILE, --file FILE        Path to the file containing the list of URLs to process. (Default: 'urls.txt')
    -o OUTPUT, --output OUTPUT  Output directory to save the generated PROMPT-RESPONSE pair files. (Default: 'tuningPairs')
    -m MODEL, --model MODEL     Specify the OpenAI GPT model to use. (Default: 'gpt-3.5-turbo')
    -n NUM_REQUESTS, --num_requests NUM_REQUESTS
                                Number of times each URL should be processed by chatGPT. (Default: 1)

Examples:
    1. Run the script with default settings:
       fetchTuningPairs.py

    2. Run the script with verbose mode and a custom URL file and output folder:
       fetchTuningPairs.py -v -f custom_urls.txt -o custom_output

    3. Run the script with a specific model:
       fetchTuningPairs.py -m gpt-4

Notes:
    - Ensure you have set the OPENAI_API_KEY environment variable with your OpenAI API key.
    - The script uses the newspaper3k library to scrape text content from the URLs.
    - Generated prompt-response pairs are saved as individual text files named after each URL's domain and path.
"""
import os
import subprocess
import sys
import importlib.util  # find_spec lives in the importlib.util submodule


def check_dependencies():
    package_mapping = {
        "newspaper": "newspaper3k",
        "openai": "openai",
        "tqdm": "tqdm",
        "nltk": "nltk",
    }
    missing_packages = [
        package
        for package, pip_name in package_mapping.items()
        if not importlib.util.find_spec(package)
    ]
    if missing_packages:
        print("Some dependencies are missing. Installing...")
        for package in missing_packages:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package_mapping[package]]
            )
        print("Dependencies installed.")


# Check for missing dependencies
check_dependencies()

import re
import argparse
import openai
import newspaper
import nltk
from tqdm import tqdm
from urllib.parse import urlparse

# nltk.word_tokenize needs the "punkt" tokenizer models; fetch them if not already present.
nltk.download("punkt", quiet=True)


def count_tokens(text):
    tokens = nltk.word_tokenize(text)
    return len(tokens)


def split_text_into_chunks(text, max_tokens):
    tokens = nltk.word_tokenize(text)
    chunks = []
    current_chunk = []
    for token in tokens:
        current_chunk.append(token)
        if len(current_chunk) >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
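
# Note: nltk word tokens only approximate the model's BPE token count, which is
# presumably why generate_pairs_from_text() below leaves a generous margin under
# each model's context window when sizing chunks.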


def scrape_text_from_url(url):
    # newspaper3k's configuration names the user-agent setting "browser_user_agent"
    article = newspaper.Article(
        url,
        language="en",
        browser_user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    )
    article.download()
    article.parse()
    return article.text


def clean_text(text):
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def chatgpt_request(prompt, model, response_tokens):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Analyze the text provided by the user and generate at least 30 to 50 high quality and verbose LLM tuning response-pairs without exception. Utilize the format: 'PROMPT: [prompt]\nRESPONSE: [response]\n'. Extract prompt-response pairs based on facts from the user's text. Employ a variety of strategies to ensure the minimum pairs requirement is met, including rewording previous prompts or responses for alternative phrasings, exploring various viewpoints for contrasting opinions or perspectives, and applying reasoning to make inferences based on facts in the user's text. To maximize the number of pairs produced, utilize techniques such as multi-level analysis to investigate the text at different levels of granularity, question generation to create prompts from relevant questions, and theme identification to focus on central themes or topics within the text. Don't return prompts whose response contains anything similar to 'is not mentioned in the provided text' or 'no information in the provided text' in your response. Fully exploit all available tokens for your response to increase the number of pairs generated. You must meet the minimum pair count requirements at all cost.",
            },
            {"role": "user", "content": prompt},
        ],
        max_tokens=response_tokens,
        temperature=0.7,
        top_p=1,
        n=1,
    )
    return response["choices"][0]["message"]["content"].strip()
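
# chatgpt_request() relies on openai.ChatCompletion.create from the 0.x openai-python
# releases; openai>=1.0 moved chat completions to a client-based interface, so a 0.x
# release of the openai package is needed for this call to work unchanged (the
# dependency check above installs whichever version pip resolves).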


def count_pairs(text):
    pairs = text.split("\n\n")
    count = 0
    for pair in pairs:
        if pair.startswith("PROMPT:") and "RESPONSE:" in pair:
            count += 1
    return count


def generate_pairs_from_text(url, text_content, args):
    verbose = args.verbose
    model = args.model
    model_token_limits = {
        "gpt-3.5": 4000,
        "gpt-3.5-turbo": 4000,
        "gpt-4": 8100,
    }
    cleaned_text = clean_text(text_content)
    chunk_limit = (
        model_token_limits.get(model, 4000) - 3000
    )  # Reserve at least 3000 tokens for response from chatGPT
    total_token_count = count_tokens(cleaned_text)
    if total_token_count < 1:
        if verbose:
            print(
                "Extraction Failed - No Tokens Detected... nothing to send to chatGPT... Unable to generate PROMPT-RESPONSE pairs."
            )
        return None
    else:
        if verbose:
            print(f"{total_token_count} tokens extracted from URL...")
    if total_token_count <= chunk_limit:
        text_chunks = [cleaned_text]
    else:
        text_chunks = split_text_into_chunks(cleaned_text, chunk_limit)
        if verbose:
            print(
                "Splitting tokens into chunks to ensure we are within the model token limits..."
            )
    prompt_response_pairs_list = []
    for text_chunk in text_chunks:
        response_token_limit = (
            model_token_limits.get(model, 4000) - count_tokens(text_chunk) - 200
        )  # 200 system prompt tokens
        if verbose:
            print(
                f"Sending {count_tokens(text_chunk)} tokens to {model} for processing... Awaiting response..."
            )
        prompt_response_pairs = chatgpt_request(text_chunk, model, response_token_limit)
        if verbose:
            print(
                f"{count_pairs(prompt_response_pairs)} PROMPT-RESPONSE pairs generated."
            )
        prompt_response_pairs_list.append(prompt_response_pairs)
    return "\n".join(prompt_response_pairs_list)


def safe_filename_from_url(url):
    parsed = urlparse(url)
    parsed_filename = f"{parsed.netloc}.{parsed.path.replace('/', '.')}.txt"
    filename = re.sub(r"\.+", ".", parsed_filename)
    return re.sub(r"[^a-zA-Z0-9\.\-_]", "", filename)
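
# For example, "https://example.com/some/page" becomes "example.com.some.page.txt":
# runs of dots are collapsed, and only letters, digits, dots, dashes and underscores
# are kept.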


def read_urls_from_file(file_path):
    if not os.path.exists(file_path):
        print(
            f"Error: File '{file_path}' does not exist. Please provide a valid file containing URLs."
        )
        sys.exit(1)
    with open(file_path, "r") as file:
        urls = file.readlines()
    return [url.strip() for url in urls if url.strip()]  # skip blank lines


def extract_unique_pairs(existing_pairs, new_pairs):
    unique_pairs = set(existing_pairs)
    for pair in new_pairs.split("\n\n"):
        if pair not in unique_pairs:
            unique_pairs.add(pair)
    return unique_pairs


def save_pairs_to_file(pairs, file_path, verbose):
    if pairs is None:
        if verbose:
            print("Nothing to save...\n")
    else:
        with open(file_path, "w") as file:
            file.write(pairs)
        if verbose:
            print("Save Successful.\n")


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="fetchTuningPairs.py - Fetches LLM tuning PROMPT-RESPONSE pairs from textual content extracted from URLs."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output for more detailed information during execution.",
    )
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        default="urls.txt",
        help="Path to the file containing the list of URLs to process.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="tuningPairs",
        help="Output directory to save the generated PROMPT-RESPONSE pair files.",
    )
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="gpt-3.5-turbo",
        help="Specify the OpenAI GPT model to use.",
    )
    parser.add_argument(
        "-n",
        "--num_requests",
        type=int,
        default=1,
        help="Number of times each URL should be processed by chatGPT.",
    )
    return parser.parse_args()


def print_apikey_instructions_and_exit():
    print("The OPENAI_API_KEY environment variable is not set.")
    print("\nTo set the environment variable, follow these steps:")
    print("1. Open your terminal or command prompt.")
    print("2. Add the environment variable based on your operating system:")
    print(
        " - For Linux/macOS, run terminal command: export OPENAI_API_KEY='your-api-key'"
    )
    print(" - For Windows, run terminal command: setx OPENAI_API_KEY 'your-api-key'")
    print("\nMake sure to replace 'your-api-key' with your actual API key.")
    print("After setting the environment variable, try running the script again.")
    sys.exit(1)


def main():
    if not os.environ.get("OPENAI_API_KEY"):
        print_apikey_instructions_and_exit()
    else:
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    args = parse_arguments()
    verbose = args.verbose
    file_of_urls = args.file
    output_dir = args.output
    model = args.model

    os.makedirs(output_dir, exist_ok=True)
    urls = read_urls_from_file(file_of_urls)
    total_urls = len(urls)
    if total_urls < 1:
        print(
            f"Error: {file_of_urls} is empty. Please add URLs or provide an alternative URL file using the --file argument. Use --help for more info."
        )
        sys.exit(1)

    bar_format = "{l_bar}▇{bar}░| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    for i, url in enumerate(
        tqdm(
            urls, desc="Processing URLs", unit="URL", bar_format=bar_format, leave=True
        )
    ):
        if verbose:
            print(f"Processing URL {i + 1}/{total_urls}: {url}")
        prompt_response_pairs_set = set()

        # Scrape the text from the URL
        if verbose:
            print("Attempting to extract text from URL...")
        text_content = scrape_text_from_url(url)

        for _ in range(args.num_requests):
            try:
                # Generate pairs from the scraped text content
                prompt_response_pairs = generate_pairs_from_text(
                    url, text_content, args
                )
                # generate_pairs_from_text() returns None when no text could be extracted
                if prompt_response_pairs is None:
                    continue
                # Keep only unique pairs across repeated requests
                if verbose:
                    print("Ensuring unique prompt-response pairs... ")
                prompt_response_pairs_set = extract_unique_pairs(
                    prompt_response_pairs_set, prompt_response_pairs
                )
            except Exception as e:
                print(f"Error processing URL {url}: {e}")

        combined_pairs = "\n\n".join(prompt_response_pairs_set)
        safe_filename = safe_filename_from_url(url)
        output_file = os.path.join(output_dir, safe_filename)
        if verbose and combined_pairs:
            print(f"Saving prompt-response pairs to {output_file}")
        save_pairs_to_file(combined_pairs, output_file, verbose)

    print(f"======= Processing of {args.file} is complete. ======\n")


if __name__ == "__main__":
    main()