Last active
February 18, 2024 00:49
-
-
Save pszemraj/27dbe39ccb456d080de0de513973398a to your computer and use it in GitHub Desktop.
Evaluate a text2text summarization model on CPU on 'the gauntlet': ROUGE vs. GPT-4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import logging | |
import re | |
from datetime import datetime | |
from pathlib import Path | |
import datasets | |
import evaluate | |
import fire | |
import intel_extension_for_pytorch as ipex | |
import numpy as np | |
import pandas as pd | |
import torch | |
import transformers | |
from datasets import Dataset | |
from textsum.summarize import Summarizer | |
# Root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Dataset location used by main() when no dataset_url is supplied.
GAUNTLET_URL = (
    "https://www.dropbox.com/scl/fi/u3bjyjlb474tskbjyzmpg/"
    "gauntlet_w_ref_summaries.parquet"
    "?rlkey=qjsz6htflg77monh2y5jb3kya&dl=1"
)

# Output directory used by main() when no output_dir is supplied; it sits
# next to this script.
_here = Path(__file__).parent
out_dir = _here.joinpath("rouge-vs-gpt4-gauntlet")
def shut_up():
    """Set the logging verbosity of `datasets` and `transformers` to ERROR."""
    for noisy_lib in (datasets, transformers):
        noisy_lib.utils.logging.set_verbosity(logging.ERROR)


# Applied once at import time, before any model or dataset is loaded.
shut_up()
def optimize_model(model):
    """
    Pass ``model`` through ``ipex.optimize`` with this script's fixed options.

    Args:
        model: The model object handed to ``ipex.optimize``.
    Returns:
        Whatever ``ipex.optimize`` returns for ``model``.
    """
    ipex_options = {
        "weights_prepack": False,
        "conv_bn_folding": False,
        "linear_bn_folding": False,
        "replace_dropout_with_identity": True,
        "auto_kernel_selection": True,
    }
    return ipex.optimize(model, **ipex_options)
def word_count(text):
    """
    Count the words in ``text``.

    The text is lower-cased, then every run of word characters bounded by
    word boundaries counts as one word.

    Args:
        text (str): The text to count words in.
    Returns:
        int: Number of words found (0 for an empty string).
    """
    lowered = text.lower()
    return sum(1 for _ in re.finditer(r"\b\w+\b", lowered))
def word_count_ratio(predictions, references):
    """
    Return the mean prediction/reference word-count ratio, rounded to 3 places.

    Each prediction is paired with the reference at the same index and the
    ratio ``word_count(prediction) / word_count(reference)`` is averaged over
    all pairs.

    Args:
        predictions (list[str]): Generated texts.
        references (list[str]): Reference texts, aligned with ``predictions``.
    Returns:
        float: Mean ratio rounded to 3 decimals, or ``nan`` when no pair has a
            reference with at least one word (this includes empty input).
    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError("Lists must have the same length.")

    ratios = []
    skipped = 0
    for prediction, reference in zip(predictions, references):
        reference_words = word_count(reference)
        if reference_words == 0:
            # The ratio is undefined for a reference without words; dividing
            # would raise ZeroDivisionError and abort the whole evaluation.
            skipped += 1
            continue
        ratios.append(word_count(prediction) / reference_words)

    if skipped:
        logging.warning(
            "Skipped %d pair(s) with an empty reference in word_count_ratio",
            skipped,
        )
    if not ratios:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return float("nan")
    # Convert to a plain float so the value is a builtin type for callers.
    return round(float(np.mean(ratios)), 3)
def save_results(results, model_name, output_dir):
    """
    Write ``results`` as indented JSON to a timestamped file in ``output_dir``.

    The file is named ``rouge_results_<model>_<timestamp>.json`` where
    ``<model>`` is the last path segment of ``model_name``.

    Args:
        results (dict): JSON-serializable evaluation results.
        model_name (str): Model name or path, e.g. ``"org/model"``.
        output_dir (str | Path): Destination directory; created together with
            any missing parents.
    Returns:
        Path: Location of the written file.
    """
    # Timestamp keeps repeated runs of the same model from sharing a name.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Strip trailing separators before taking the last segment so a local
    # path such as "models/my-model/" yields "my-model" rather than "".
    # Backslashes are treated as separators too, for Windows-style paths.
    model_identifier = re.split(r"[\\/]", model_name.rstrip("\\/"))[-1]

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_file_path = (
        output_dir / f"rouge_results_{model_identifier}_{timestamp}.json"
    )
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    logging.info(f"Saved evaluation results to {output_file_path}")
    return output_file_path
def evaluate_summaries(summarizer, dataset_url, output_dir, shuffle: bool = False):
    """
    Summarize every document in the dataset, score with ROUGE, save results.

    Args:
        summarizer: Callable applied to each row's "document_text"; its
            ``config`` and ``model_name_or_path`` attributes are also read.
        dataset_url: Parquet location passed to ``pd.read_parquet``. The data
            must provide "document_text", "summary" and "source_doc_filename"
            columns.
        output_dir: Directory handed to ``save_results``.
        shuffle (bool): Shuffle the rows before summarizing.
    Returns:
        dict: ROUGE scores (multiplied by 100, rounded to 3 decimals) plus the
            "word_count_ratio", "metadata" and "predicted_summaries" entries.
    """
    rouge_metric = evaluate.load("rouge")
    frame = pd.read_parquet(dataset_url)
    gauntlet = Dataset.from_pandas(frame)
    if shuffle:
        logging.info("Shuffling the dataset")
        gauntlet = gauntlet.shuffle()

    def _add_prediction(row):
        # One document per call: the dataset is mapped with batched=False.
        return {"predicted_summary": summarizer(row["document_text"])}

    gauntlet = gauntlet.map(_add_prediction, batched=False)

    predicted = gauntlet["predicted_summary"]
    reference = gauntlet["summary"]

    raw_scores = rouge_metric.compute(
        predictions=predicted,
        references=reference,
        use_aggregator=True,
    )
    # Report the scores as percentages.
    results = {}
    for metric_name, score in raw_scores.items():
        results[metric_name] = round(score * 100, 3)
    results["word_count_ratio"] = word_count_ratio(
        predictions=predicted,
        references=reference,
    )
    results["metadata"] = summarizer.config

    # Printed before the summaries are attached, so the console output holds
    # only the scores and metadata.
    print(json.dumps(results, indent=4))

    results["predicted_summaries"] = [
        {"filename": doc_name, "summary": summary_text}
        for doc_name, summary_text in zip(
            gauntlet["source_doc_filename"], predicted
        )
    ]
    save_results(results, summarizer.model_name_or_path, output_dir)
    return results
def main(
    model_name,
    token_batch_length=16384,
    num_beams=2,
    dataset_url: str = None,
    output_dir: str = None,
    shuffle: bool = False,
):
    """
    Generate and evaluate summaries using the specified model.

    Args:
        model_name (str): The name of the model to be used for summarization.
        token_batch_length (int): The maximum number of tokens in a batch.
        num_beams (int): The number of beams for beam search.
        dataset_url (str, optional): The URL of the dataset to be summarized.
            Defaults to ``GAUNTLET_URL``.
        output_dir (str, optional): The directory to save the output summaries.
            Defaults to ``out_dir``.
        shuffle (bool): Whether to shuffle the dataset before summarizing.
    Returns:
        None
    """
    # Set default values if dataset_url and output_dir are not provided
    dataset_url = dataset_url if dataset_url is not None else GAUNTLET_URL
    output_dir = Path(output_dir) if output_dir is not None else out_dir
    # parents=True so a nested output directory whose parent does not exist
    # yet is created instead of raising FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize the Summarizer with the specified parameters
    summarizer = Summarizer(
        model_name,
        token_batch_length=token_batch_length,
        disable_progress_bar=True,
        compile_model=False,
        num_beams=num_beams,
    )
    summarizer.print_config()

    # Apply general IPEX optimizations and compile the model
    summarizer.model = optimize_model(summarizer.model)
    summarizer.model = torch.compile(summarizer.model, backend="ipex")

    # Evaluate the summaries and save the results to the output directory
    evaluate_summaries(summarizer, dataset_url, output_dir, shuffle=shuffle)
    logging.info("Done!")


if __name__ == "__main__":
    # Expose main()'s parameters as command-line arguments.
    fire.Fire(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment