@pszemraj · Last active February 18, 2024 00:49
Evaluate a text2text summarization model on CPU on 'the gauntlet': ROUGE scores of model outputs vs. GPT-4 reference summaries
import json
import logging
import re
from datetime import datetime
from pathlib import Path

import datasets
import evaluate
import fire
import intel_extension_for_pytorch as ipex
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from textsum.summarize import Summarizer

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
GAUNTLET_URL = "https://www.dropbox.com/scl/fi/u3bjyjlb474tskbjyzmpg/gauntlet_w_ref_summaries.parquet?rlkey=qjsz6htflg77monh2y5jb3kya&dl=1"
_here = Path(__file__).parent
out_dir = _here / "rouge-vs-gpt4-gauntlet"


def shut_up():
    """Silence verbose logging from datasets and transformers."""
    datasets.utils.logging.set_verbosity(logging.ERROR)
    transformers.utils.logging.set_verbosity(logging.ERROR)


shut_up()


def optimize_model(model):
    """Apply IPEX optimizations for CPU inference.

    BN folding and weight prepacking are disabled (a transformer has no
    batch-norm layers to fold, and prepacking can interfere with later
    compilation); dropout is replaced with identity for inference.
    """
    return ipex.optimize(
        model,
        weights_prepack=False,
        conv_bn_folding=False,
        linear_bn_folding=False,
        replace_dropout_with_identity=True,
        auto_kernel_selection=True,
    )


def word_count(text):
    """Count word tokens in lowercased text."""
    words = re.findall(r"\b\w+\b", text.lower())
    return len(words)


def word_count_ratio(predictions, references):
    """Mean ratio of predicted word count to reference word count."""
    if len(predictions) != len(references):
        raise ValueError("Lists must have the same length.")
    a = [word_count(t) for t in predictions]
    b = [word_count(t) for t in references]
    ratios = [x / y for x, y in zip(a, b)]
    return round(np.mean(ratios), 3)
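

# A quick hypothetical sanity check (not part of the original script):
# word_count lowercases and counts regex word tokens, so
#     word_count_ratio(["the cat sat on the mat"], ["a cat sat"])
# is 6 / 3 == 2.0, i.e. predictions averaging twice the reference length.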


def save_results(results, model_name, output_dir):
    """Write evaluation results to a timestamped JSON file and return its path."""
    # Format the current date and time to append to the file name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Extract a concise model identifier (model name may contain path segments)
    model_identifier = model_name.split("/")[-1]
    # Create the output directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Name the file with the model identifier and timestamp
    output_file_name = f"rouge_results_{model_identifier}_{timestamp}.json"
    output_file_path = output_dir / output_file_name
    # Save the results to the output file
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
    logging.info(f"Saved evaluation results to {output_file_path}")
    return output_file_path
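

# For illustration, a hypothetical model_name of "org/my-model" saved at
# 2024-02-18 00:49:00 would produce rouge_results_my-model_2024-02-18_00-49-00.json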


def evaluate_summaries(summarizer, dataset_url, output_dir, shuffle: bool = False):
    """Summarize the gauntlet documents and score them against the reference summaries."""
    rouge = evaluate.load("rouge")
    gauntlet_ds = Dataset.from_pandas(pd.read_parquet(dataset_url))
    if shuffle:
        logging.info("Shuffling the dataset")
        gauntlet_ds = gauntlet_ds.shuffle()
    # Generate a summary for each document
    gauntlet_ds = gauntlet_ds.map(
        lambda x: {"predicted_summary": summarizer(x["document_text"])},
        batched=False,
    )
    # Aggregate ROUGE scores against the reference (GPT-4) summaries
    results = rouge.compute(
        predictions=gauntlet_ds["predicted_summary"],
        references=gauntlet_ds["summary"],
        use_aggregator=True,
    )
    results = {k: round(v * 100, 3) for k, v in results.items()}
    results["word_count_ratio"] = word_count_ratio(
        predictions=gauntlet_ds["predicted_summary"],
        references=gauntlet_ds["summary"],
    )
    results["metadata"] = summarizer.config
    print(json.dumps(results, indent=4))
    # Attach the predicted summaries so they are saved alongside the scores
    predictions_out = [
        {"filename": name, "summary": summ}
        for name, summ in zip(
            gauntlet_ds["source_doc_filename"], gauntlet_ds["predicted_summary"]
        )
    ]
    results["predicted_summaries"] = predictions_out
    _ = save_results(results, summarizer.model_name_or_path, output_dir)
    return results
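

# For reference, evaluate's "rouge" metric reports rouge1/rouge2/rougeL/rougeLsum,
# so the saved JSON is shaped roughly like (values are placeholders):
#     {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...,
#      "word_count_ratio": ..., "metadata": {...}, "predicted_summaries": [...]}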


def main(
    model_name,
    token_batch_length=16384,
    num_beams=2,
    dataset_url: str = None,
    output_dir: str = None,
    shuffle: bool = False,
):
    """
    Generate and evaluate summaries using the specified model.

    Args:
        model_name (str): The name of the model to be used for summarization.
        token_batch_length (int): The maximum number of tokens in a batch.
        num_beams (int): The number of beams for beam search.
        dataset_url (str, optional): The URL of the dataset to be summarized.
        output_dir (str, optional): The directory to save the output summaries.
        shuffle (bool, optional): Whether to shuffle the dataset before summarizing.

    Returns:
        None
    """
    # Fall back to the gauntlet dataset and the default output dir if not provided
    dataset_url = dataset_url if dataset_url is not None else GAUNTLET_URL
    output_dir = Path(output_dir) if output_dir is not None else out_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    # Initialize the Summarizer with the specified parameters
    summarizer = Summarizer(
        model_name,
        token_batch_length=token_batch_length,
        disable_progress_bar=True,
        compile_model=False,
        num_beams=num_beams,
    )
    summarizer.print_config()
    # Apply general IPEX optimizations, then compile the model with the IPEX backend
    summarizer.model = optimize_model(summarizer.model)
    summarizer.model = torch.compile(summarizer.model, backend="ipex")
    # Evaluate the summaries and save the results to the output directory
    _ = evaluate_summaries(summarizer, dataset_url, output_dir, shuffle=shuffle)
    logging.info("Done!")


if __name__ == "__main__":
    fire.Fire(main)
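
A minimal sketch of a command-line invocation, assuming the script is saved as eval_gauntlet.py (the filename and model are placeholders; fire maps main's parameters to flags):

    python eval_gauntlet.py --model_name pszemraj/long-t5-tglobal-base-16384-book-summary --num_beams 4

Any of token_batch_length, dataset_url, output_dir, and shuffle can be overridden the same way.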