Python script to scrape URLs and create prompt-response pairs suitable for AI LLM tuning
#!/usr/bin/env python3
"""
fetchTuningPairs.py
Author: Shane Kretzmann
Email: ShaneKretzmann@gmail.com
Gist: https://gist.github.com/uberdragon/b779de99e39af4dad91d5e1fc39221eb

Description:
    This script fetches LLM tuning PROMPT-RESPONSE pairs from the textual content extracted
    from a list of URLs. The extracted text is sent to the chatGPT model, which generates
    high-quality prompt-response pairs. The generated pairs are then saved to individual
    text files, one file per URL.

Usage:
    fetchTuningPairs.py [-h] [-v] [-f FILE] [-o OUTPUT] [-m MODEL] [-n NUM_REQUESTS]

Arguments:
    -h, --help                  Show the help message and exit.
    -v, --verbose               Enable verbose mode to display detailed information during processing.
    -f FILE, --file FILE        Path to the file containing the list of URLs to process. (Default: 'urls.txt')
    -o OUTPUT, --output OUTPUT  Output directory to save the generated PROMPT-RESPONSE pair files. (Default: 'tuningPairs')
    -m MODEL, --model MODEL     Specify the OpenAI GPT model to use. (Default: 'gpt-3.5-turbo')
    -n NUM_REQUESTS, --num_requests NUM_REQUESTS
                                Number of times each URL should be processed by chatGPT. (Default: 1)

Examples:
    1. Run the script with default settings:
       fetchTuningPairs.py

    2. Run the script with verbose mode and a custom URL file and output folder:
       fetchTuningPairs.py -v -f custom_urls.txt -o custom_output

    3. Run the script with a specific model:
       fetchTuningPairs.py -m gpt-4

Notes:
    - Ensure you have set the OPENAI_API_KEY environment variable with your OpenAI API key.
    - The script uses the newspaper3k library to scrape text content from the URLs.
    - Generated prompt-response pairs are saved as individual text files named after each URL's domain and path.
"""
import os
import subprocess
import sys
import importlib.util  # find_spec lives in the importlib.util submodule


def check_dependencies():
    package_mapping = {
        "newspaper": "newspaper3k",
        "openai": "openai",
        "tqdm": "tqdm",
        "nltk": "nltk",
    }
    missing_packages = [
        package
        for package, pip_name in package_mapping.items()
        if not importlib.util.find_spec(package)
    ]
    if missing_packages:
        print("Some dependencies are missing. Installing...")
        for package in missing_packages:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package_mapping[package]]
            )
        print("Dependencies installed.")


# Check for missing dependencies
check_dependencies()

import re
import argparse
import openai
import newspaper
import nltk
from tqdm import tqdm
from urllib.parse import urlparse

# nltk.word_tokenize needs the "punkt" tokenizer models; fetch them if not already present.
nltk.download("punkt", quiet=True)


def count_tokens(text):
    tokens = nltk.word_tokenize(text)
    return len(tokens)


def split_text_into_chunks(text, max_tokens):
    tokens = nltk.word_tokenize(text)
    chunks = []
    current_chunk = []
    for token in tokens:
        current_chunk.append(token)
        if len(current_chunk) >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
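
# Note: nltk word tokens only approximate the model's BPE token count, which is
# presumably why generate_pairs_from_text() below leaves a generous margin under
# each model's context window when sizing chunks.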


def scrape_text_from_url(url):
    # newspaper3k's configuration names the user-agent setting "browser_user_agent"
    article = newspaper.Article(
        url,
        language="en",
        browser_user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    )
    article.download()
    article.parse()
    return article.text


def clean_text(text):
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def chatgpt_request(prompt, model, response_tokens):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Analyze the text provided by the user and generate at least 30 to 50 high quality and verbose LLM tuning response-pairs without exception. Utilize the format: 'PROMPT: [prompt]\nRESPONSE: [response]\n'. Extract prompt-response pairs based on facts from the user's text. Employ a variety of strategies to ensure the minimum pairs requirement is met, including rewording previous prompts or responses for alternative phrasings, exploring various viewpoints for contrasting opinions or perspectives, and applying reasoning to make inferences based on facts in the user's text. To maximize the number of pairs produced, utilize techniques such as multi-level analysis to investigate the text at different levels of granularity, question generation to create prompts from relevant questions, and theme identification to focus on central themes or topics within the text. Don't return prompts whose response contains anything similar to 'is not mentioned in the provided text' or 'no information in the provided text' in your response. Fully exploit all available tokens for your response to increase the number of pairs generated. You must meet the minimum pair count requirements at all cost.",
            },
            {"role": "user", "content": prompt},
        ],
        max_tokens=response_tokens,
        temperature=0.7,
        top_p=1,
        n=1,
    )
    return response["choices"][0]["message"]["content"].strip()
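
# chatgpt_request() relies on openai.ChatCompletion.create from the 0.x openai-python
# releases; openai>=1.0 moved chat completions to a client-based interface, so a 0.x
# release of the openai package is needed for this call to work unchanged (the
# dependency check above installs whichever version pip resolves).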


def count_pairs(text):
    pairs = text.split("\n\n")
    count = 0
    for pair in pairs:
        if pair.startswith("PROMPT:") and "RESPONSE:" in pair:
            count += 1
    return count


def generate_pairs_from_text(url, text_content, args):
    verbose = args.verbose
    model = args.model
    model_token_limits = {
        "gpt-3.5": 4000,
        "gpt-3.5-turbo": 4000,
        "gpt-4": 8100,
    }
    cleaned_text = clean_text(text_content)
    chunk_limit = (
        model_token_limits.get(model, 4000) - 3000
    )  # Reserve at least 3000 tokens for response from chatGPT
    total_token_count = count_tokens(cleaned_text)
    if total_token_count < 1:
        if verbose:
            print(
                "Extraction Failed - No Tokens Detected... nothing to send to chatGPT... Unable to generate PROMPT-RESPONSE pairs."
            )
        return None
    else:
        if verbose:
            print(f"{total_token_count} tokens extracted from URL...")
    if total_token_count <= chunk_limit:
        text_chunks = [cleaned_text]
    else:
        text_chunks = split_text_into_chunks(cleaned_text, chunk_limit)
        if verbose:
            print(
                "Splitting tokens into chunks to ensure we are within the model token limits..."
            )
    prompt_response_pairs_list = []
    for text_chunk in text_chunks:
        response_token_limit = (
            model_token_limits.get(model, 4000) - count_tokens(text_chunk) - 200
        )  # 200 system prompt tokens
        if verbose:
            print(
                f"Sending {count_tokens(text_chunk)} tokens to {model} for processing... Awaiting response..."
            )
        prompt_response_pairs = chatgpt_request(text_chunk, model, response_token_limit)
        if verbose:
            print(
                f"{count_pairs(prompt_response_pairs)} PROMPT-RESPONSE pairs generated."
            )
        prompt_response_pairs_list.append(prompt_response_pairs)
    return "\n".join(prompt_response_pairs_list)


def safe_filename_from_url(url):
    parsed = urlparse(url)
    parsed_filename = f"{parsed.netloc}.{parsed.path.replace('/', '.')}.txt"
    filename = re.sub(r"\.+", ".", parsed_filename)
    return re.sub(r"[^a-zA-Z0-9\.\-_]", "", filename)
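
# For example, "https://example.com/some/page" becomes "example.com.some.page.txt":
# runs of dots are collapsed, and only letters, digits, dots, dashes and underscores
# are kept.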


def read_urls_from_file(file_path):
    if not os.path.exists(file_path):
        print(
            f"Error: File '{file_path}' does not exist. Please provide a valid file containing URLs."
        )
        sys.exit(1)
    with open(file_path, "r") as file:
        urls = file.readlines()
    return [url.strip() for url in urls if url.strip()]  # skip blank lines


def extract_unique_pairs(existing_pairs, new_pairs):
    unique_pairs = set(existing_pairs)
    for pair in new_pairs.split("\n\n"):
        if pair not in unique_pairs:
            unique_pairs.add(pair)
    return unique_pairs


def save_pairs_to_file(pairs, file_path, verbose):
    if pairs is None:
        if verbose:
            print("Nothing to save...\n")
    else:
        with open(file_path, "w") as file:
            file.write(pairs)
        if verbose:
            print("Save Successful.\n")


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="fetchTuningPairs.py - Fetches LLM tuning PROMPT-RESPONSE pairs from textual content extracted from URLs."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output for more detailed information during execution.",
    )
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        default="urls.txt",
        help="Path to the file containing the list of URLs to process.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="tuningPairs",
        help="Output directory to save the generated PROMPT-RESPONSE pair files.",
    )
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="gpt-3.5-turbo",
        help="Specify the OpenAI GPT model to use.",
    )
    parser.add_argument(
        "-n",
        "--num_requests",
        type=int,
        default=1,
        help="Number of times each URL should be processed by chatGPT.",
    )
    return parser.parse_args()


def print_apikey_instructions_and_exit():
    print("The OPENAI_API_KEY environment variable is not set.")
    print("\nTo set the environment variable, follow these steps:")
    print("1. Open your terminal or command prompt.")
    print("2. Add the environment variable based on your operating system:")
    print(
        " - For Linux/macOS, run terminal command: export OPENAI_API_KEY='your-api-key'"
    )
    print(" - For Windows, run terminal command: setx OPENAI_API_KEY 'your-api-key'")
    print("\nMake sure to replace 'your-api-key' with your actual API key.")
    print("After setting the environment variable, try running the script again.")
    sys.exit(1)


def main():
    if not os.environ.get("OPENAI_API_KEY"):
        print_apikey_instructions_and_exit()
    else:
        openai.api_key = os.environ.get("OPENAI_API_KEY")

    args = parse_arguments()
    verbose = args.verbose
    file_of_urls = args.file
    output_dir = args.output
    model = args.model

    os.makedirs(output_dir, exist_ok=True)
    urls = read_urls_from_file(file_of_urls)
    total_urls = len(urls)
    if total_urls < 1:
        print(
            f"Error: {file_of_urls} is empty. Please add URLs or provide an alternative URL file using the --file argument. Use --help for more info."
        )
        sys.exit(1)

    bar_format = "{l_bar}▇{bar}░| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
    for i, url in enumerate(
        tqdm(
            urls, desc="Processing URLs", unit="URL", bar_format=bar_format, leave=True
        )
    ):
        if verbose:
            print(f"Processing URL {i + 1}/{total_urls}: {url}")
        prompt_response_pairs_set = set()

        # Scrape the text from the URL
        if verbose:
            print("Attempting to extract text from URL...")
        text_content = scrape_text_from_url(url)

        for _ in range(args.num_requests):
            try:
                # Generate pairs from the scraped text content
                prompt_response_pairs = generate_pairs_from_text(
                    url, text_content, args
                )
                # generate_pairs_from_text() returns None when no text could be extracted
                if prompt_response_pairs is None:
                    continue
                # Keep only unique pairs across repeated requests
                if verbose:
                    print("Ensuring unique prompt-response pairs... ")
                prompt_response_pairs_set = extract_unique_pairs(
                    prompt_response_pairs_set, prompt_response_pairs
                )
            except Exception as e:
                print(f"Error processing URL {url}: {e}")

        combined_pairs = "\n\n".join(prompt_response_pairs_set)
        safe_filename = safe_filename_from_url(url)
        output_file = os.path.join(output_dir, safe_filename)
        if verbose and combined_pairs:
            print(f"Saving prompt-response pairs to {output_file}")
        save_pairs_to_file(combined_pairs, output_file, verbose)

    print(f"======= Processing of {args.file} is complete. ======\n")


if __name__ == "__main__":
    main()