@yujiepan-work
Last active May 15, 2023 13:40
optimum pipeline eval
  • Entries in OV_MODELS or TORCH_MODELS can be a model id on the Hugging Face Hub or a path to a local folder (a conversion sketch follows these notes).
  • For text tasks, a tokenizer patch ensures the sequence length is fixed; it can be removed if the model accepts inputs of arbitrary shape.
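A minimal sketch of producing such a local folder (not part of the original scripts; the model id and output path are placeholders, and depending on the optimum-intel version the conversion flag is export=True or, in older releases, from_transformers=True):

from optimum.intel.openvino import OVModelForImageClassification
from transformers import AutoFeatureExtractor

model_id = "skylord/swin-finetuned-food101"  # any Hub id or local transformers checkpoint
save_dir = "./swin-food101-ov"               # placeholder output folder

# Export the PyTorch checkpoint to OpenVINO IR and save it locally,
# so that save_dir can be listed in OV_MODELS instead of a Hub model id.
ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True)
ov_model.save_pretrained(save_dir)

# Save the preprocessor alongside the model so the pipeline can load both from save_dir.
AutoFeatureExtractor.from_pretrained(model_id).save_pretrained(save_dir)

# ===== image classification evaluation on food101 =====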
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from optimum.intel.openvino import OVModelForImageClassification
import pandas as pd
import numpy as np
import datasets
import evaluate # Use pip install git+https://github.com/huggingface/evaluate.git
from evaluate import evaluator
from transformers import AutoTokenizer, pipeline, AutoFeatureExtractor, AutoModelForImageClassification
from multiprocessing import Pool
from collections import defaultdict
import time
OV_MODELS = [
    # "skylord/swin-finetuned-food101",
    # "echarlaix/vit-food101-int8",
    # "helenai/swin-base-food101-jpqd-ov",
    # "vuiseng9/swin-base-food101-int8-structured43-15eph",
    "yujiepan/internal.swin-base-food101-int8-structured38.01",
]
TORCH_MODELS = [
    "skylord/swin-finetuned-food101",
]
def prepare_dataset():
    dataset = datasets.load_dataset('food101', split='validation')
    return dataset

def inference(model_id, dataset):
    print(f'Inference on {model_id}...')
    batch_size = 1  # keep 1 for now; other values may cause errors because the dataset size is not divisible by the batch size
    if model_id in TORCH_MODELS:
        model = AutoModelForImageClassification.from_pretrained(model_id)
    else:
        # reshape the OpenVINO model to a static input shape before compiling
        model = OVModelForImageClassification.from_pretrained(model_id, compile=False)
        model.reshape(batch_size, 3, 224, 224)
        model.compile()
    tokenizer = AutoFeatureExtractor.from_pretrained(model_id)
    ov_pipeline = pipeline("image-classification", model=model, feature_extractor=tokenizer, batch_size=batch_size)
    task_evaluator = evaluator("image-classification")
    metric = evaluate.load('accuracy')
    ov_eval_results = task_evaluator.compute(
        model_or_pipeline=ov_pipeline,
        data=dataset,
        metric=metric,
        label_mapping=model.config.label2id,
    )
    ov_eval_results['latency_in_ms'] = ov_eval_results['latency_in_seconds'] * 1000
    print(model_id, ov_eval_results)
    return ov_eval_results

dataset = prepare_dataset()
all_models = OV_MODELS + TORCH_MODELS
records = [inference(model_id, dataset) for model_id in all_models]
pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=all_models)
print(df.to_string())
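
# ===== question answering evaluation on SQuAD =====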
from contextlib import contextmanager
from unittest.mock import patch
import datasets
import evaluate
from evaluate import evaluator
from transformers import AutoTokenizer, pipeline, AutoModelForQuestionAnswering
import pandas as pd
from optimum.intel.openvino import OVModelForQuestionAnswering
OV_MODELS = [
    "yujiepan/test.mobilebert-uncased-squadv1",  # or some local path
]
TORCH_MODELS = [
]
@contextmanager
def patch_tokenizer(tokenizer):
    # ensure the input is padded to a fixed length
    _original_call = tokenizer.__class__.__call__
    pad_on_right = tokenizer.padding_side == "right"

    def _new_call(self, *args, **kwargs):
        kwargs['max_length'] = 384
        kwargs['padding'] = 'max_length'
        kwargs['truncation'] = "only_second" if pad_on_right else "only_first"
        kwargs['return_overflowing_tokens'] = True
        kwargs['return_offsets_mapping'] = True
        kwargs['stride'] = 128
        return _original_call(self, *args, **kwargs)

    with patch('.'.join([_original_call.__module__, _original_call.__qualname__]), _new_call):
        yield

def prepare_dataset():
    # prepare dataset & evaluation metric
    dataset = datasets.load_dataset("squad", split='validation')
    return dataset
def inference(model_id, dataset):
    print(f'Inference on {model_id}...')
    if model_id in OV_MODELS:
        model = OVModelForQuestionAnswering.from_pretrained(model_id)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    ov_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    # inference
    qa_eval = evaluator("question-answering")
    with patch_tokenizer(tokenizer):
        metric = evaluate.load('squad')
        ov_eval_results = qa_eval.compute(
            model_or_pipeline=ov_pipeline,
            data=dataset,
            metric=metric,
        )
    print(model_id, ov_eval_results)
    return ov_eval_results

dataset = prepare_dataset()
all_models = OV_MODELS + TORCH_MODELS
records = [inference(model_id, dataset) for model_id in all_models]
pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=all_models)
print(df.to_string())
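
# ===== text classification evaluation on GLUE SST-2 =====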
from contextlib import contextmanager
from unittest.mock import patch
from optimum.intel.openvino import OVModelForSequenceClassification
import pandas as pd
import datasets
import evaluate
from evaluate import evaluator
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
TASK_NAME = "sst2"
OV_MODELS = [
    # "yujiepan/bert-base-uncased-sst2",
    # "yujiepan/bert-base-uncased-sst2-PTQ",
    # "yujiepan/bert-base-uncased-sst2-int8-unstructured80-17epoch",
    "yujiepan/bert-base-uncased-sst2-int8-unstructured80-30epoch",
]
TORCH_MODELS = []
@contextmanager
def patch_tokenizer(tokenizer):
    # ensure the input is padded to a fixed length
    _original_call = tokenizer.__class__.__call__

    def _new_call(self, *args, **kwargs):
        kwargs['max_length'] = 128
        kwargs['padding'] = 'max_length'
        kwargs['truncation'] = True
        return _original_call(self, *args, **kwargs)

    with patch('.'.join([_original_call.__module__, _original_call.__qualname__]), _new_call):
        yield

def prepare_dataset():
    # prepare dataset & evaluation metric
    dataset = datasets.load_dataset("glue", TASK_NAME)
    labels = dataset['train'].features['label'].names
    label2id = dict(zip(labels, range(len(labels))))
    id2label = dict(zip(range(len(labels)), labels))
    task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mnli-mm": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
    }
    input_column = task_to_keys[TASK_NAME][0]
    return dataset, label2id, input_column
def inference(model_id):
    print(f'Inference on {model_id}...')
    # prepare pipeline
    if model_id in OV_MODELS:
        optimized_model = OVModelForSequenceClassification.from_pretrained(model_id)
    else:
        optimized_model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    ov_sst2_pipeline = pipeline("text-classification", model=optimized_model, tokenizer=tokenizer)
    # inference
    glue_eval = evaluator("text-classification")
    with patch_tokenizer(tokenizer):
        metric = evaluate.load('glue', TASK_NAME)
        ov_eval_results = glue_eval.compute(
            model_or_pipeline=ov_sst2_pipeline,
            data=dataset['validation'],
            metric=metric,
            input_column=input_column,
            label_mapping=label2id if optimized_model.config.label2id == label2id else None,
        )
    print(model_id, ov_eval_results)
    return ov_eval_results

dataset, label2id, input_column = prepare_dataset()
all_models = OV_MODELS + TORCH_MODELS
records = [inference(model_id) for model_id in all_models]
pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=all_models)
print(df)
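
# ===== audio classification (keyword spotting) evaluation on SUPERB ks =====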
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import datasets
import evaluate # Use pip install git+https://github.com/huggingface/evaluate.git
from evaluate import evaluator
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, pipeline
from collections import defaultdict
import time
import numpy as np
import pandas as pd
from multiprocessing import Pool
from optimum.intel.openvino import OVModelForAudioClassification
OV_MODELS = [
    # "superb/wav2vec2-base-superb-ks",
    # "anton-l/wav2vec2-base-finetuned-ks",
    # "helenai/wav2vec2-base-superb-ks-jpqd-ov",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured83",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured79",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-inputs",
    "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-feature-extractor",
]
TORCH_MODELS = [
    # "superb/wav2vec2-base-superb-ks",
    "anton-l/wav2vec2-base-finetuned-ks",
]
def prepare_dataset():
    split_ = 'validation'
    dataset = datasets.load_dataset("superb", "ks", split=split_)
    # dataset = dataset.select(range(3000))  # uncomment to measure latency on a subset; keep it commented to get the actual accuracy
    return dataset

def inference(model_id, dataset):
    print(f'Inference on {model_id}...')
    batch_size = 1  # keep 1 for now; other values may cause errors because the dataset size is not divisible by the batch size
    if model_id in TORCH_MODELS:
        model = AutoModelForAudioClassification.from_pretrained(model_id)
    else:
        model = OVModelForAudioClassification.from_pretrained(model_id)
    tokenizer = AutoFeatureExtractor.from_pretrained(model_id)
    ov_pipeline = pipeline("audio-classification", model=model, feature_extractor=tokenizer, batch_size=batch_size)
    task_evaluator = evaluator("audio-classification")
    metric = evaluate.load('accuracy')
    ov_eval_results = task_evaluator.compute(
        model_or_pipeline=ov_pipeline,
        data=dataset,
        metric=metric,
        label_mapping=model.config.label2id,
    )
    ov_eval_results['latency_in_ms'] = ov_eval_results['latency_in_seconds'] * 1000
    print(model_id, ov_eval_results)
    return ov_eval_results

dataset = prepare_dataset()
all_models = OV_MODELS + TORCH_MODELS
records = [inference(model_id, dataset) for model_id in all_models]
pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=all_models)
print(df.to_string())