# Source: GitHub Gist by @yujiepan-work
# (gist id: yujiepan-work/c38dc4e56c7a9d803c42988f7b7d260a).
# Last active March 8, 2023 12:53.
from contextlib import contextmanager
from unittest.mock import patch
from optimum.intel.openvino import OVModelForSequenceClassification
import pandas as pd
import datasets
import evaluate
from evaluate import evaluator
from transformers import AutoTokenizer, pipeline
# GLUE task name; used as the dataset config, the metric config and the key
# for picking the input column.
TASK_NAME = "sst2"

# Model ids handed to `from_pretrained`, evaluated in this order; they also
# become the row index of the final results table.
MODEL_IDS = [
    "yujiepan/bert-base-uncased-sst2",
    "yujiepan/bert-base-uncased-sst2-PTQ",
    "yujiepan/bert-base-uncased-sst2-int8-unstructured80-17epoch",
    "yujiepan/bert-base-uncased-sst2-int8-unstructured80-30epoch",
]
@contextmanager
def patch_tokenizer(tokenizer, max_length=128):
    """Temporarily force fixed-length tokenization on ``tokenizer``'s class.

    While the context is active, every call of an instance of
    ``type(tokenizer)`` is made with ``max_length=max_length``,
    ``padding='max_length'`` and ``truncation=True``; values the caller passed
    for these three keyword arguments are overridden, all other arguments are
    forwarded untouched. The original ``__call__`` is put back on exit, also
    when the body of the ``with`` statement raises.

    Args:
        tokenizer: Object whose class defines or inherits ``__call__``.
        max_length: Value forced for the ``max_length`` keyword argument.
            Defaults to 128.

    Yields:
        None.
    """
    tokenizer_cls = type(tokenizer)
    _original_call = tokenizer_cls.__call__

    def _new_call(self, *args, **kwargs):
        # Unconditionally override so the input is padded to a fixed length.
        kwargs['max_length'] = max_length
        kwargs['padding'] = 'max_length'
        kwargs['truncation'] = True
        return _original_call(self, *args, **kwargs)

    # Patch the attribute on the class object directly instead of through a
    # dotted path assembled from __module__/__qualname__: such a path cannot be
    # resolved when the class is not importable by name (e.g. it was defined
    # inside a function). The class, not the instance, is patched because
    # calling an object looks __call__ up on its type.
    with patch.object(tokenizer_cls, '__call__', _new_call):
        yield
def prepare_dataset():
    """Load the GLUE dataset for ``TASK_NAME`` and derive evaluation inputs.

    Returns:
        A tuple ``(dataset, label2id, input_column)`` where ``dataset`` is the
        result of ``datasets.load_dataset("glue", TASK_NAME)``, ``label2id``
        maps each label name of the train split to its position in the label
        list, and ``input_column`` is the name of the first text column of the
        task.

    Raises:
        KeyError: If ``TASK_NAME`` has no entry in the column table below.
    """
    dataset = datasets.load_dataset("glue", TASK_NAME)
    labels = dataset['train'].features['label'].names
    label2id = {name: index for index, name in enumerate(labels)}

    # Text column(s) per GLUE task: (first column, second column or None).
    task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mnli-mm": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
    }
    # Only the first column is returned; the second column of sentence-pair
    # tasks is not used by this script.
    input_column, _ = task_to_keys[TASK_NAME]
    return dataset, label2id, input_column
def inference(model_id):
    """Evaluate the model ``model_id`` on the validation split.

    Reads the module-level names ``dataset``, ``label2id`` and
    ``input_column``, so those must be assigned before this is called.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            OpenVINO model and its tokenizer.

    Returns:
        Whatever the text-classification evaluator's ``compute`` returns.
    """
    print(f'Inference on {model_id}...')

    # Assemble the text-classification pipeline around the OpenVINO model.
    ov_model = OVModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    classifier = pipeline("text-classification", model=ov_model, tokenizer=tokenizer)

    task_evaluator = evaluator("text-classification")
    # The tokenizer is patched for the whole evaluation run.
    with patch_tokenizer(tokenizer):
        glue_metric = evaluate.load('glue', TASK_NAME)

        # Pass the dataset's label mapping only when the model config agrees
        # with it. NOTE(review): otherwise no mapping is passed at all --
        # confirm the evaluator copes with the model's raw label strings.
        if ov_model.config.label2id == label2id:
            mapping = label2id
        else:
            mapping = None

        return task_evaluator.compute(
            model_or_pipeline=classifier,
            data=dataset['validation'],
            metric=glue_metric,
            input_column=input_column,
            label_mapping=mapping,
        )
# Load the data once; inference() reads these three module-level names.
dataset, label2id, input_column = prepare_dataset()

# Run every model in turn, collecting one result record per model id.
records = list(map(inference, MODEL_IDS))

# Tabulate the records with the model id as row index and print the table.
pd.set_option('max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=MODEL_IDS)
print(df)
# df.to_csv('ovmodel_inference.csv')
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")