A script to quantize BART with HF's OVQuantizer
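This script targets the `optimum-intel` API of this era, in which `OVQuantizer` consumes an NNCF compression dict wrapped in an `OVConfig`. It assumes `optimum` is installed with the OpenVINO and NNCF extras (e.g. `pip install optimum[openvino,nncf]`); later releases may expose a different quantization interface.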
from functools import partial

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from optimum.intel import OVConfig, OVQuantizer
max_input_length = 512
max_target_length = 128

model_id = "lidiya/bart-large-xsum-samsum"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The directory where the quantized model will be saved
save_dir = "ptq_model"
def preprocess_function(examples, tokenizer, max_input_length, max_target_length):
    # Tokenize the dialogues, truncating to the encoder's maximum input length
    inputs = [doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets and tokenize the reference summaries
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Define the NNCF quantization configuration describing the quantization we wish to apply
bart_config = {
    "algorithm": "quantization",
    "preset": "mixed",
    "overflow_fix": "disable",
    "initializer": {
        "range": {"num_init_samples": 300, "type": "mean_min_max"},
        "batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
    },
    # Use symmetric activation quantization for ops matching the bmm_0 pattern
    "scope_overrides": {"activations": {"{re}.*bmm_0": {"mode": "symmetric"}}},
    # Skip quantization of embeddings, residual adds, layer norms, and ops
    # matching the bmm_1 pattern
    "ignored_scopes": [
        "{re}.*Embedding.*",
        "{re}.*add___.*",
        "{re}.*layer_norm_.*",
        "{re}.*bmm_1",
    ],
}
quantization_config = OVConfig(compression=bart_config)

# Instantiate the OVQuantizer from the model to be quantized
quantizer = OVQuantizer.from_pretrained(model)
# Create the calibration dataset used to perform static quantization
calibration_dataset = quantizer.get_calibration_dataset(
    "samsum",
    # dataset_config_name="samsum",
    preprocess_function=partial(
        preprocess_function,
        tokenizer=tokenizer,
        max_input_length=max_input_length,
        max_target_length=max_target_length,
    ),
    num_samples=300,
    dataset_split="train",
)
# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(
    quantization_config=quantization_config,
    calibration_dataset=calibration_dataset,
    save_directory=save_dir,
)

# Save the tokenizer
tokenizer.save_pretrained(save_dir)
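
Once the script finishes, `save_dir` contains the quantized model in OpenVINO IR format plus the tokenizer files. A minimal sketch of how the result might be loaded back for inference, assuming the `OVModelForSeq2SeqLM` wrapper from `optimum.intel` (not used in the gist itself):

from optimum.intel import OVModelForSeq2SeqLM
from transformers import AutoTokenizer

# Load the quantized OpenVINO IR model and its tokenizer from the save directory
ov_model = OVModelForSeq2SeqLM.from_pretrained("ptq_model")
tokenizer = AutoTokenizer.from_pretrained("ptq_model")

dialogue = "Anna: Are we still on for lunch tomorrow?\nBen: Yes, 12:30 at the usual spot."
inputs = tokenizer(dialogue, max_length=512, truncation=True, return_tensors="pt")

# Generate and decode a summary with the quantized seq2seq model
summary_ids = ov_model.generate(**inputs, max_length=128)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))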