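"""Sparsify-then-quantize pipeline: apply unstructured magnitude pruning to the
decoder Linear layers of an OPT or Llama causal LM, quantize the result to W8A8
with optimum-intel's OVQuantizer (NNCF), and report how much of the sparsity
survives in the INT8 constants of the exported OpenVINO IR."""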

import copy
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # hide all GPUs so everything runs on CPU

from functools import partial
from pathlib import Path
import json
from dataclasses import dataclass
import typing

import torch
import transformers
from transformers import HfArgumentParser
from optimum.intel.openvino import OVConfig, OVQuantizer
from openvino.runtime import Core

@dataclass
class Args:
    model_id: str
    save_dir: str
    sparsity: float
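
# NNCF W8A8 post-training quantization config: 8-bit symmetric weights and
# activations ("performance" preset), with min/max range calibration over a
# handful of samples. ignored_scopes keeps embeddings, residual adds, and
# normalization ops in floating point; the patterns are NNCF scope regexes
# and may need adjusting for architectures other than OPT/Llama.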
W8A8_QUANTIZATION_CONFIG = {
    "algorithm": "quantization",
    "preset": "performance",  # symmetric weights, symmetric activations
    "overflow_fix": "disable",
    "initializer": {
        "range": {"num_init_samples": 16, "type": "min_max"},
        "batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
    },
    "ignored_scopes": [
        "{re}.*Embedding.*",
        "{re}.*add___.*",
        "{re}.*norm.*",  # LayerNorm; models other than llama/opt may need a different pattern
        # "{re}.*matmul_1",
        # "{re}.*__truediv__.*",
    ],
}

ie = Core()


def _model_has_layer_norm(model):
    # Normalize parameter names so 'layer_norm', 'layernorm', etc. all match.
    for name, _ in model.named_parameters():
        if 'layernorm' in name.lower().replace('_', ''):
            return True
    return False

@torch.no_grad()
def _sparsify_param(param: torch.nn.Parameter, sparsity: float):
    """Zero out the `sparsity` fraction of entries with the smallest magnitudes."""
    k = min(param.numel(), int(param.numel() * sparsity) + 1)
    abs_data = param.data.abs()
    threshold = float(torch.kthvalue(abs_data.view(-1), k).values)
    param.data[abs_data <= threshold] = 0.
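
# Worked example of the thresholding above (illustrative values, not from the
# original script): for a parameter [0.1, -0.5, 0.2, 0.9] and sparsity=0.5,
# k = int(4 * 0.5) + 1 = 3, the 3rd-smallest magnitude is 0.5, and every entry
# with |value| <= 0.5 is zeroed, leaving [0, 0, 0, 0.9]. The "+ 1" and ties at
# the threshold can overshoot the target, but negligibly so at real layer sizes.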

def _get_named_linear_layers(model: transformers.PreTrainedModel):
    # Yield only the Linear layers inside the decoder blocks; the embeddings
    # and the lm_head stay dense.
    if model.__class__.__name__ == 'OPTForCausalLM':
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear) and name.startswith('model.decoder.layers.'):
                yield name, module
    elif model.__class__.__name__ == 'LlamaForCausalLM':
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear) and name.startswith('model.layers.'):
                yield name, module
    else:
        raise NotImplementedError(model.__class__.__name__)

def sparsify(model: transformers.PreTrainedModel, sparsity: float):
    for name, module in _get_named_linear_layers(model):
        module: torch.nn.Linear
        _sparsify_param(module.weight, sparsity=sparsity)
        if module.bias is not None:
            _sparsify_param(module.bias, sparsity=sparsity)
        # Sanity-check the achieved weight sparsity (5% relative tolerance).
        assert (module.weight == 0).float().mean() >= sparsity * 0.95
    return model

def quantize_and_save(model: transformers.PreTrainedModel, tokenizer, save_dir):
    # Some checkpoints replace the word-token embedding with a custom module;
    # register it with NNCF so model tracing does not fail.
    if hasattr(model, "transformer") and hasattr(model.transformer, "wte") \
            and type(model.transformer.wte) != torch.nn.Embedding:
        print(model.config_class)
        from nncf.torch import register_module
        register_module(ignored_algorithms=[])(type(model.transformer.wte))

    def preprocess_fn(examples, tokenizer):
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=32)

    compression_config = copy.deepcopy(W8A8_QUANTIZATION_CONFIG)
    if not _model_has_layer_norm(model):
        print('Dropping the norm ignored_scope since the model has no LayerNorm.')
        # Must match the pattern listed in W8A8_QUANTIZATION_CONFIG exactly,
        # otherwise list.remove() raises ValueError.
        compression_config['ignored_scopes'].remove("{re}.*norm.*")
    quantizer = OVQuantizer.from_pretrained(model)
    calibration_dataset = quantizer.get_calibration_dataset(
        "wikitext",
        dataset_config_name="wikitext-2-v1",
        num_samples=20,
        dataset_split='train',
        preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    )
    ov_config = OVConfig(compression_config)
    ov_config.log_dir = str(save_dir)
    quantizer.quantize(
        calibration_dataset=calibration_dataset,
        save_directory=save_dir,
        quantization_config=ov_config,
    )
    tokenizer.save_pretrained(save_dir)
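
# Note: OVQuantizer.quantize() exports the quantized OpenVINO IR
# (openvino_model.xml plus its weights .bin) into save_directory;
# check_ov_w8a8_ir_sparsity below re-opens that IR to verify how much of the
# injected sparsity survived quantization.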

def check_ov_w8a8_ir_sparsity(ov_xml_path: str):
    result = {}
    ovmodel = ie.read_model(model=ov_xml_path)
    for op in ovmodel.get_ordered_ops():
        # INT8 Constant ops hold the quantized weights; record their zero fraction.
        if "constant" in str(op.get_type_info()).lower() and 'int8_t' in str(op.get_element_type()):
            sparsity = (op.get_data() == 0).astype(float).mean()
            result[op.get_friendly_name()] = sparsity
    with open(Path(ov_xml_path).parent / 'ov_sparsity_stats.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print('OV model quantized linear layer sparsity:\n', json.dumps(result, indent=2))

def main(model_id: str, save_dir: str, sparsity: float):
    xml_name = 'openvino_model.xml'
    if Path(save_dir, xml_name).exists():
        return  # already exported; skip
    Path(save_dir).mkdir(exist_ok=True, parents=True)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # ensure a pad token exists for batched tokenization
    model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
    model = sparsify(model, sparsity=sparsity)
    quantize_and_save(model, tokenizer, save_dir)
    check_ov_w8a8_ir_sparsity(Path(save_dir, xml_name).as_posix())
    print(f'Quantized sparse ov model saved at {save_dir}.')

if __name__ == "__main__":
    # main('facebook/opt-350m', './logs/opt-350m-w8a8-unstructured50/', sparsity=0.5)
    # main('facebook/opt-350m', './logs/opt-350m-w8a8-unstructured90/', sparsity=0.9)
    # main('facebook/opt-6.7b', './logs/opt-6.7b-w8a8-unstructured50/', sparsity=0.5)
    # main('facebook/opt-6.7b', './logs/opt-6.7b-w8a8-unstructured90/', sparsity=0.9)
    # main('meta-llama/Llama-2-7b-hf', './logs/llama-2-7b-w8a8-unstructured50/', sparsity=0.5)
    # main('meta-llama/Llama-2-13b-hf', './logs/llama-2-13b-w8a8-unstructured50/', sparsity=0.5)
    args = HfArgumentParser(Args).parse_args_into_dataclasses()[0]
    args = typing.cast(Args, args)
    main(args.model_id, args.save_dir, args.sparsity)
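
# Example CLI invocation (HfArgumentParser derives --model_id/--save_dir/--sparsity
# from the Args dataclass; the script filename here is a placeholder):
#   python sparsify_and_quantize.py --model_id facebook/opt-350m \
#       --save_dir ./logs/opt-350m-w8a8-unstructured50/ --sparsity 0.5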