@pszemraj · Last active February 18, 2024 00:49
Evaluate a text2text summarization model on CPU on 'the gauntlet': ROUGE scores of model outputs vs. GPT-4 reference summaries
import json
import logging
import re
from datetime import datetime
from pathlib import Path

import datasets
import evaluate
import fire
import intel_extension_for_pytorch as ipex
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from textsum.summarize import Summarizer

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
GAUNTLET_URL = "https://www.dropbox.com/scl/fi/u3bjyjlb474tskbjyzmpg/gauntlet_w_ref_summaries.parquet?rlkey=qjsz6htflg77monh2y5jb3kya&dl=1"
_here = Path(__file__).parent
out_dir = _here / "rouge-vs-gpt4-gauntlet"


def shut_up():
    """Silence verbose logging from datasets and transformers."""
    datasets.utils.logging.set_verbosity(logging.ERROR)
    transformers.utils.logging.set_verbosity(logging.ERROR)


shut_up()


def optimize_model(model):
    """Apply IPEX optimizations for CPU inference.

    BN folding and weight prepacking are disabled (a transformer has no
    batch-norm layers to fold, and prepacking can interfere with later
    compilation); dropout is replaced with identity for inference.
    """
    return ipex.optimize(
        model,
        weights_prepack=False,
        conv_bn_folding=False,
        linear_bn_folding=False,
        replace_dropout_with_identity=True,
        auto_kernel_selection=True,
    )


def word_count(text):
    """Count word tokens in lowercased text."""
    words = re.findall(r"\b\w+\b", text.lower())
    return len(words)


def word_count_ratio(predictions, references):
    """Mean ratio of predicted word count to reference word count."""
    if len(predictions) != len(references):
        raise ValueError("Lists must have the same length.")
    a = [word_count(t) for t in predictions]
    b = [word_count(t) for t in references]
    ratios = [x / y for x, y in zip(a, b)]
    return round(np.mean(ratios), 3)
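

# A quick hypothetical sanity check (not part of the original script):
# word_count lowercases and counts regex word tokens, so
#     word_count_ratio(["the cat sat on the mat"], ["a cat sat"])
# is 6 / 3 == 2.0, i.e. predictions averaging twice the reference length.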


def save_results(results, model_name, output_dir):
    """Write evaluation results to a timestamped JSON file and return its path."""
    # Format the current date and time to append to the file name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Extract a concise model identifier (model name may contain path segments)
    model_identifier = model_name.split("/")[-1]
    # Create the output directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Name the file with the model identifier and timestamp
    output_file_name = f"rouge_results_{model_identifier}_{timestamp}.json"
    output_file_path = output_dir / output_file_name
    # Save the results to the output file
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
    logging.info(f"Saved evaluation results to {output_file_path}")
    return output_file_path
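

# For illustration, a hypothetical model_name of "org/my-model" saved at
# 2024-02-18 00:49:00 would produce rouge_results_my-model_2024-02-18_00-49-00.json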


def evaluate_summaries(summarizer, dataset_url, output_dir, shuffle: bool = False):
    """Summarize the gauntlet documents and score them against the reference summaries."""
    rouge = evaluate.load("rouge")
    gauntlet_ds = Dataset.from_pandas(pd.read_parquet(dataset_url))
    if shuffle:
        logging.info("Shuffling the dataset")
        gauntlet_ds = gauntlet_ds.shuffle()
    # Generate a summary for each document
    gauntlet_ds = gauntlet_ds.map(
        lambda x: {"predicted_summary": summarizer(x["document_text"])},
        batched=False,
    )
    # Aggregate ROUGE scores against the reference (GPT-4) summaries
    results = rouge.compute(
        predictions=gauntlet_ds["predicted_summary"],
        references=gauntlet_ds["summary"],
        use_aggregator=True,
    )
    results = {k: round(v * 100, 3) for k, v in results.items()}
    results["word_count_ratio"] = word_count_ratio(
        predictions=gauntlet_ds["predicted_summary"],
        references=gauntlet_ds["summary"],
    )
    results["metadata"] = summarizer.config
    print(json.dumps(results, indent=4))
    # Attach the predicted summaries so they are saved alongside the scores
    predictions_out = [
        {"filename": name, "summary": summ}
        for name, summ in zip(
            gauntlet_ds["source_doc_filename"], gauntlet_ds["predicted_summary"]
        )
    ]
    results["predicted_summaries"] = predictions_out
    _ = save_results(results, summarizer.model_name_or_path, output_dir)
    return results
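

# For reference, evaluate's "rouge" metric reports rouge1/rouge2/rougeL/rougeLsum,
# so the saved JSON is shaped roughly like (values are placeholders):
#     {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...,
#      "word_count_ratio": ..., "metadata": {...}, "predicted_summaries": [...]}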


def main(
    model_name,
    token_batch_length=16384,
    num_beams=2,
    dataset_url: str = None,
    output_dir: str = None,
    shuffle: bool = False,
):
    """
    Generate and evaluate summaries using the specified model.

    Args:
        model_name (str): The name of the model to be used for summarization.
        token_batch_length (int): The maximum number of tokens in a batch.
        num_beams (int): The number of beams for beam search.
        dataset_url (str, optional): The URL of the dataset to be summarized.
        output_dir (str, optional): The directory to save the output summaries.
        shuffle (bool, optional): Whether to shuffle the dataset before summarizing.

    Returns:
        None
    """
    # Fall back to the gauntlet dataset and the default output dir if not provided
    dataset_url = dataset_url if dataset_url is not None else GAUNTLET_URL
    output_dir = Path(output_dir) if output_dir is not None else out_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    # Initialize the Summarizer with the specified parameters
    summarizer = Summarizer(
        model_name,
        token_batch_length=token_batch_length,
        disable_progress_bar=True,
        compile_model=False,
        num_beams=num_beams,
    )
    summarizer.print_config()
    # Apply general IPEX optimizations, then compile the model with the IPEX backend
    summarizer.model = optimize_model(summarizer.model)
    summarizer.model = torch.compile(summarizer.model, backend="ipex")
    # Evaluate the summaries and save the results to the output directory
    _ = evaluate_summaries(summarizer, dataset_url, output_dir, shuffle=shuffle)
    logging.info("Done!")


if __name__ == "__main__":
    fire.Fire(main)
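
A minimal sketch of a command-line invocation, assuming the script is saved as eval_gauntlet.py (the filename and model are placeholders; fire maps main's parameters to flags):

    python eval_gauntlet.py --model_name pszemraj/long-t5-tglobal-base-16384-book-summary --num_beams 4

Any of token_batch_length, dataset_url, output_dir, and shuffle can be overridden the same way.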