yujiepan-work/ovmodel_pipeline_superb_ks.py

## ovmodel_pipeline_superb_ks.py
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from collections import defaultdict
import time
import numpy as np
import pandas as pd
from multiprocessing import Pool

import datasets
import evaluate  # Use pip install git+https://github.com/huggingface/evaluate.git
from evaluate import evaluator
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, pipeline

from optimum.intel.openvino import OVModelForAudioClassification

# Checkpoints that `inference` loads as plain PyTorch models; every other id
# in MODEL_IDS is loaded through OVModelForAudioClassification.
TORCH_MODELS = [
    "superb/wav2vec2-base-superb-ks",
    "anton-l/wav2vec2-base-finetuned-ks",
]

# Every checkpoint to benchmark, in the order it appears in the final report.
MODEL_IDS = TORCH_MODELS + [
    "helenai/wav2vec2-base-superb-ks-jpqd-ov",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured83",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured79",
    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-inputs",
    "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-feature-extractor",
]

# Dataset split to evaluate on.
# split_ = 'validation'
split_ = 'test'
# Only the first 3000 examples are kept, which is enough to measure latency.
# Drop the `.select(...)` call to get the accuracy on the whole split.
dataset = datasets.load_dataset("superb", "ks", split=split_).select(range(3000))

print('Evaluating ks ' + split_)


class Timer:
    """Record wall-clock timings of a pipeline's processing stages.

    ``timer_start[stage]`` and ``timer_end[stage]`` hold the
    ``time.perf_counter()`` readings (in seconds) taken right before and right
    after each call of that stage. Entry ``i`` of both lists belongs to the
    same call.
    """

    # Stage methods that get instrumented and reported.
    _STAGES = ('preprocess', 'forward', 'postprocess')

    def __init__(self) -> None:
        self.timer_start = defaultdict(list)
        self.timer_end = defaultdict(list)

    def report_dict(self):
        """Return the mean latency of every stage that was called at least once.

        Returns:
            A dict mapping ``'<stage>_latency_in_ms'`` to the mean call
            duration in milliseconds. Stages without any recorded call are
            left out.
        """
        result = {}
        for key in self._STAGES:
            # `.get` keeps the lookup from inserting empty entries into the
            # defaultdicts as a side effect of reporting.
            starts = self.timer_start.get(key)
            if not starts:
                continue
            ends = self.timer_end.get(key, [])
            durations = np.array(ends) - np.array(starts)
            result[key + '_latency_in_ms'] = float(np.mean(durations)) * 1000
        return result

    def add_perf_counter(self, ovpipe, enable=True):
        """Wrap the stage methods of ``ovpipe`` so that every call is timed.

        The wrappers are installed on the ``ovpipe`` instance, not on its
        class. Patching the class would make each later call wrap the
        previous wrappers again, so timers created for earlier pipelines
        would keep recording (and slowing down) calls of later ones.

        Args:
            ovpipe: Object exposing ``preprocess``, ``forward`` and
                ``postprocess`` methods, e.g. a transformers pipeline.
            enable: When False, ``ovpipe`` is left untouched.
        """
        import functools

        def log_time(fn, name):
            @functools.wraps(fn)
            def timed(*args, **kwargs):
                start = time.perf_counter()
                try:
                    return fn(*args, **kwargs)
                finally:
                    # Both readings are stored together, even when `fn`
                    # raises, so the start/end lists always stay aligned.
                    end = time.perf_counter()
                    self.timer_start[name].append(start)
                    self.timer_end[name].append(end)
            return timed

        if enable:
            for stage in self._STAGES:
                setattr(ovpipe, stage, log_time(getattr(ovpipe, stage), stage))


def inference(model_id):
    """Evaluate one checkpoint on the module-level ``dataset``.

    Checkpoints listed in ``TORCH_MODELS`` are loaded with
    ``AutoModelForAudioClassification``; all others are loaded with
    ``OVModelForAudioClassification``.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        The dict produced by the audio-classification evaluator, extended
        with ``latency_in_ms`` and the per-stage latencies reported by
        ``Timer.report_dict``.
    """
    print(f'Inference on {model_id}...')
    model_cls = (
        AutoModelForAudioClassification
        if model_id in TORCH_MODELS
        else OVModelForAudioClassification
    )
    model = model_cls.from_pretrained(model_id)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
    clf_pipeline = pipeline(
        "audio-classification",
        model=model,
        feature_extractor=feature_extractor,
        # Fixed at 1 for now: other values may cause errors because the
        # dataset size is not divisible by them.
        batch_size=1,
    )

    # Instrument the pipeline stages before the evaluator drives it.
    stage_timer = Timer()
    stage_timer.add_perf_counter(clf_pipeline, enable=True)

    results = evaluator("audio-classification").compute(
        model_or_pipeline=clf_pipeline,
        data=dataset,
        metric=evaluate.load('accuracy'),
        label_mapping=model.config.label2id,
    )
    results['latency_in_ms'] = results['latency_in_seconds'] * 1000
    results.update(stage_timer.report_dict())
    return results


# When only the accuracy matters, the models can be evaluated in parallel
# processes to reduce the total run time:
# n_process = len(MODEL_IDS)
# pool = Pool(n_process)
# records = pool.map(inference, MODEL_IDS)
# pool.close()
# pool.join()

# When the latency matters, evaluate the models one by one in this process.
records = [inference(model_id) for model_id in MODEL_IDS]

# Widen the column limit so long cell values are not truncated when printed.
pd.set_option('max_colwidth', 100)
df = pd.DataFrame.from_records(records, index=MODEL_IDS)
print(df.to_string())
# df.to_csv(f'superb_ks_pipeline.csv')
	import os

	os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

	from collections import defaultdict
	import time
	import numpy as np
	import pandas as pd
	from multiprocessing import Pool

	import datasets
	import evaluate # Use pip install git+https://github.com/huggingface/evaluate.git
	from evaluate import evaluator
	from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, pipeline

	from optimum.intel.openvino import OVModelForAudioClassification

	# Checkpoints that `inference` loads as plain PyTorch models; every other id
	# in MODEL_IDS is loaded through OVModelForAudioClassification.
	TORCH_MODELS = [
	    "superb/wav2vec2-base-superb-ks",
	    "anton-l/wav2vec2-base-finetuned-ks",
	]

	# Every checkpoint to benchmark, in the order it appears in the final report.
	MODEL_IDS = TORCH_MODELS + [
	    "helenai/wav2vec2-base-superb-ks-jpqd-ov",
	    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured83",
	    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured79",
	    # "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-inputs",
	    "yujiepan/internal.wav2vec2-base-superb-ks-int8-structured64-quantize-feature-extractor",
	]

	# Dataset split to evaluate on.
	# split_ = 'validation'
	split_ = 'test'
	# Only the first 3000 examples are kept, which is enough to measure latency.
	# Drop the `.select(...)` call to get the accuracy on the whole split.
	dataset = datasets.load_dataset("superb", "ks", split=split_).select(range(3000))

	print('Evaluating ks ' + split_)



	class Timer:
	    """Record wall-clock timings of a pipeline's processing stages.

	    ``timer_start[stage]`` and ``timer_end[stage]`` hold the
	    ``time.perf_counter()`` readings (in seconds) taken right before and
	    right after each call of that stage. Entry ``i`` of both lists belongs
	    to the same call.
	    """

	    # Stage methods that get instrumented and reported.
	    _STAGES = ('preprocess', 'forward', 'postprocess')

	    def __init__(self) -> None:
	        self.timer_start = defaultdict(list)
	        self.timer_end = defaultdict(list)

	    def report_dict(self):
	        """Return the mean latency of every stage called at least once.

	        Returns:
	            A dict mapping ``'<stage>_latency_in_ms'`` to the mean call
	            duration in milliseconds. Stages without any recorded call
	            are left out.
	        """
	        result = {}
	        for key in self._STAGES:
	            # `.get` keeps the lookup from inserting empty entries into
	            # the defaultdicts as a side effect of reporting.
	            starts = self.timer_start.get(key)
	            if not starts:
	                continue
	            ends = self.timer_end.get(key, [])
	            durations = np.array(ends) - np.array(starts)
	            result[key + '_latency_in_ms'] = float(np.mean(durations)) * 1000
	        return result

	    def add_perf_counter(self, ovpipe, enable=True):
	        """Wrap the stage methods of ``ovpipe`` so every call is timed.

	        The wrappers are installed on the ``ovpipe`` instance, not on its
	        class. Patching the class would make each later call wrap the
	        previous wrappers again, so timers created for earlier pipelines
	        would keep recording (and slowing down) calls of later ones.

	        Args:
	            ovpipe: Object exposing ``preprocess``, ``forward`` and
	                ``postprocess`` methods, e.g. a transformers pipeline.
	            enable: When False, ``ovpipe`` is left untouched.
	        """
	        import functools

	        def log_time(fn, name):
	            # The wrapper must forward arbitrary positional and keyword
	            # arguments unchanged to the wrapped stage.
	            @functools.wraps(fn)
	            def timed(*args, **kwargs):
	                start = time.perf_counter()
	                try:
	                    return fn(*args, **kwargs)
	                finally:
	                    # Both readings are stored together, even when `fn`
	                    # raises, so the start/end lists always stay aligned.
	                    end = time.perf_counter()
	                    self.timer_start[name].append(start)
	                    self.timer_end[name].append(end)
	            return timed

	        if enable:
	            for stage in self._STAGES:
	                setattr(ovpipe, stage, log_time(getattr(ovpipe, stage), stage))


	def inference(model_id):
	    """Evaluate one checkpoint on the module-level ``dataset``.

	    Checkpoints listed in ``TORCH_MODELS`` are loaded with
	    ``AutoModelForAudioClassification``; all others are loaded with
	    ``OVModelForAudioClassification``.

	    Args:
	        model_id: Checkpoint identifier passed to ``from_pretrained``.

	    Returns:
	        The dict produced by the audio-classification evaluator, extended
	        with ``latency_in_ms`` and the per-stage latencies reported by
	        ``Timer.report_dict``.
	    """
	    print(f'Inference on {model_id}...')
	    # Fixed at 1 for now: other values may cause errors because the
	    # dataset size is not divisible by them.
	    batch_size = 1
	    if model_id in TORCH_MODELS:
	        model = AutoModelForAudioClassification.from_pretrained(model_id)
	    else:
	        model = OVModelForAudioClassification.from_pretrained(model_id)
	    tokenizer = AutoFeatureExtractor.from_pretrained(model_id)
	    ov_pipeline = pipeline("audio-classification", model=model, feature_extractor=tokenizer, batch_size=batch_size)

	    # Instrument the pipeline stages before the evaluator drives it.
	    timer = Timer()
	    timer.add_perf_counter(ov_pipeline, enable=True)

	    task_evaluator = evaluator("audio-classification")
	    metric = evaluate.load('accuracy')
	    ov_eval_results = task_evaluator.compute(
	        model_or_pipeline=ov_pipeline,
	        data=dataset,
	        metric=metric,
	        label_mapping=model.config.label2id,
	    )
	    ov_eval_results['latency_in_ms'] = ov_eval_results['latency_in_seconds'] * 1000
	    ov_eval_results.update(timer.report_dict())
	    return ov_eval_results


	# When only the accuracy matters, the models can be evaluated in parallel
	# processes to reduce the total run time:
	# n_process = len(MODEL_IDS)
	# pool = Pool(n_process)
	# records = pool.map(inference, MODEL_IDS)
	# pool.close()
	# pool.join()

	# When the latency matters, evaluate the models one by one in this process.
	records = [inference(model_id) for model_id in MODEL_IDS]

	# Widen the column limit so long cell values are not truncated when printed.
	pd.set_option('max_colwidth', 100)
	df = pd.DataFrame.from_records(records, index=MODEL_IDS)
	print(df.to_string())
	# df.to_csv(f'superb_ks_pipeline.csv')