ovcausalllm
from transformers import AutoTokenizer, AutoModelForCausalLM
from optimum.intel import OVModelForCausalLM
import time
import argparse
import os, sys
import numpy as np
import torch
import hashlib
import itertools
from typing import Dict, Optional, Tuple, Union
import ctypes
# beam-search zero-copy workaround: write beam indices into a shared buffer
# instead of physically reordering the KV-cache in Python
class OVModelForCausalLM_opt(OVModelForCausalLM):
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        time1 = time.time()
        if self.time0 is not None:
            self.latency.append(time1 - self.time0)
        self.time0 = time1
        return super().prepare_inputs_for_generation(input_ids, past_key_values, **kwargs)

    def begin_latency_record(self):
        self.time0 = None
        self.latency = []

    def latency_summary(self):
        N = len(self.latency)
        if N == 0:
            return "?"
        if N == 1:
            # only the first-token latency was recorded; avoid division by zero below
            return f"{self.latency[0]*1e3:.1f}ms"
        total_2ndTok = 0
        for i in range(1, N):
            total_2ndTok += self.latency[i]
        return f"{self.latency[0]*1e3:.1f}ms+({total_2ndTok/(N-1)*1e3:.1f} ms)x{N}"

    def setup(self):
        # set up the shared memory area; its address is published through an
        # environment variable so the inference backend can pick up the beam indices
        self.beam_idx = (ctypes.c_int * 2048)(0)
        os.environ['beam_idx_addr'] = str(hex(ctypes.addressof(self.beam_idx)))

    def clear_beam_idx(self):
        self.beam_idx[0] = 0

    def _reorder_cache(
        self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called.
        This is required to match `past_key_values` with the correct beam_idx at every generation step.
        """
        # zero-copy workaround: store the beam indices in the shared buffer
        # (element 0 holds the count) and return the cache untouched
        beam_idx_1d = torch.flatten(beam_idx)
        #print("======beam_idx_1d=", beam_idx_1d)
        self.beam_idx[0] = beam_idx_1d.shape[0]
        for i in range(beam_idx_1d.shape[0]):
            self.beam_idx[i + 1] = beam_idx_1d[i]
        return past_key_values

        # original re-ordering path below is unreachable and kept only for reference
        # print("beam_idx : ", beam_idx)
        if self.config.model_type == "bloom":
            return self._reorder_cache_bloom(past_key_values, beam_idx)
        # from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache
        return tuple(
            tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values
        )
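
# Illustrative sketch (not part of the original script): the consumer of the shared
# buffer -- presumably a patched OpenVINO CPU plugin -- is assumed to map it back from
# the address published in `beam_idx_addr`. In Python/ctypes terms that would look
# roughly like:
#
#   addr = int(os.environ["beam_idx_addr"], 16)
#   shared = (ctypes.c_int * 2048).from_address(addr)
#   n = shared[0]                       # number of beam indices written this step
#   beam_order = list(shared[1:1 + n])  # KV-cache rows are gathered in this order
#
# How the backend actually consumes the buffer is outside the scope of this script.
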
parser = argparse.ArgumentParser(prog="ovcausalllm")
parser.add_argument('model_id', default="bigscience/bloomz-560m", help =
'''
EleutherAI/pythia-70m
databricks/dolly-v2-2-8b
huggyllama/llama-7b
bigscience/bloomz-560m
EleutherAI/gpt-j-6B
''')
long_prompt='''The 1973 oil crisis began in October 1973 when the members of the Organization of Arab Petroleum Exporting Countries (OAPEC, consisting of the Arab members of OPEC plus Egypt and Syria) proclaimed an oil embargo. By the end of the embargo in March 1974, the price of oil had risen from US$3 per barrel to nearly $12 globally; US prices were significantly higher. The embargo caused an oil crisis, or "shock", with many short- and long-term effects on global politics and the global economy. It was later called the "first oil shock", followed by the 1979 oil crisis, termed the "second oil shock". So, who proclaimed the oil embargo?'''
long_prompt*=100
#long_prompt = "What is a one-sentence summary of the following article? You could go directly into the confessional (provided there's no one else in there or waiting outside), but sometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been (blank) since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy (If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. 
# I detest all of my sins because of thy just punishment. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. You may even walk away just having to say a few meaningful prayers. Take the absolution to heart -- you now have a brand new, clean slate to work with. It'll feel so uplifting! Just to clarify, \"absolution\" means your sins are washed away. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry for what you've done and that you wish for nothing more than to be forgiven. Summary:"
parser.add_argument("-t", "--torch", action="store_true")
parser.add_argument("-r", "--repeat", type=int, default=1000)
parser.add_argument("-p", "--prompt", nargs='+', type=str, default=[long_prompt])
parser.add_argument("-l0", nargs='+', type=int, default=[0])
parser.add_argument("-l", "--len", nargs='+', type=int, default=[100])
parser.add_argument("-b", "--batch", type=int, default=1)
parser.add_argument("--export", action="store_true")
parser.add_argument("--bf16", action="store_true")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("--top_p", type=float, default=0)
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument("--penalty_alpha", type=float, default=0) # default penalty_alpha==0 means degenrate to top-k
parser.add_argument("-nb",'--num_beams', type=int, default=1)
args = parser.parse_args()
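
# Example invocations (illustrative; the script name and local model paths are assumptions):
#   python ovcausalllm.py bigscience/bloomz-560m --export
#   python ovcausalllm.py /home/tingqian/models/ov-bloomz-560m -l 32 --bf16
#   python ovcausalllm.py /home/tingqian/models/ov-bloomz-560m -l 100 -nb 4 -r 3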
generate_kwargs = {}
generate_kwargs["do_sample"] = False
if args.top_k > 0:
    generate_kwargs["top_k"] = args.top_k
    generate_kwargs["do_sample"] = True
if args.penalty_alpha > 0:
    # penalty_alpha > 0 together with top_k switches generate() to contrastive search
    generate_kwargs["penalty_alpha"] = args.penalty_alpha
    generate_kwargs.pop("do_sample")
if args.top_p > 0:
    generate_kwargs["top_p"] = args.top_p
    generate_kwargs["top_k"] = 0
    generate_kwargs["do_sample"] = True
if args.batch > 1:
    if len(args.prompt) == 1:
        args.prompt *= args.batch
    if len(args.prompt) != args.batch:
        raise ValueError("prompt & batch inconsistent!")
model_id = args.model_id

# proxy settings are read from the environment (defined before first use below)
proxies = {
    'http': os.environ.get('http_proxy'),
    'https': os.environ.get('https_proxy')
}

if args.export:
    if model_id[-1] == '/':
        model_id = model_id[:-1]
    ov_pretrained_model_path = f"/home/tingqian/models/ov-{model_id.split('/')[-1]}"
    print(f"load pretrained & export & save to {ov_pretrained_model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies)
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, proxies=proxies)
    model.save_pretrained(ov_pretrained_model_path)
    tokenizer.save_pretrained(ov_pretrained_model_path)
    sys.exit()
print(f"/************** {model_id} ******************/")
ov_config={"PERFORMANCE_HINT": "LATENCY",
"INFERENCE_PRECISION_HINT" : "bf16" if args.bf16 else "f32",
"CPU_DENORMALS_OPTIMIZATION" : "YES",
"CACHE_DIR" : None}
torch_dtype = torch.bfloat16 if args.bf16 else torch.float32
#ov_config = None
if args.torch:
    print(f"load pretrained pytorch model from {model_id}")
    print(f"use precision {torch_dtype}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model = model.to(dtype=torch_dtype)
    # dump the torch.fx graph of the model and exit
    from torch.fx import symbolic_trace
    symbolic_traced: torch.fx.GraphModule = symbolic_trace(model)
    print(symbolic_traced.graph)
    sys.exit()
else:
    print(f"load pretrained ov model from {model_id}")
    print(f"ov_config={ov_config}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = OVModelForCausalLM_opt.from_pretrained(model_id, ov_config=ov_config)
    model.setup()

# https://stackoverflow.com/questions/70544129/transformers-asking-to-pad-but-the-tokenizer-does-not-have-a-padding-token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
ref_result = ''
for question_len, answer_len in itertools.product(args.l0, args.len):
    for i in range(args.repeat):
        if question_len > 0:
            # pad/truncate the prompt to a fixed question_len
            # (padding="max_length" + truncation replaces the deprecated pad_to_max_length)
            inputs = tokenizer(args.prompt, return_tensors="pt", max_length=question_len,
                               padding="max_length", truncation=True, return_token_type_ids=False)
        else:
            inputs = tokenizer(args.prompt, return_tensors="pt", padding=True, return_token_type_ids=False)
        #print(inputs)
        #inputs.pop("token_type_ids", None)
        actual_question_len = inputs.input_ids.size(1)
        max_total_len = actual_question_len + answer_len
        generate_kwargs["min_length"] = max_total_len
        generate_kwargs["max_length"] = max_total_len
        generate_kwargs["temperature"] = 0.9
        generate_kwargs["pad_token_id"] = tokenizer.eos_token_id
        if args.num_beams > 1:
            generate_kwargs["num_beams"] = args.num_beams
        # print(f"inputs.input_ids.shape={inputs.input_ids.shape} question_len={question_len} actual_question_len={actual_question_len} answer_len={answer_len}")
        # print(f"max_total_len : {max_total_len}")
        t0 = time.time()
        model.clear_beam_idx()
        model.begin_latency_record()
        gen_tokens = model.generate(**inputs, **generate_kwargs)
        t1 = time.time()
        time.sleep(1)
        result = tokenizer.batch_decode(gen_tokens)
        result_str = ';;;'.join(result).encode('utf-8')
        md5sum = hashlib.md5(result_str).hexdigest()
        if ref_result != result_str:
            # only print the generated text when it differs from the previous round
            ref_result = result_str
            print(f"\n*** Text generated: (with md5sum {md5sum}) ***")
            for rid, r in enumerate(result):
                if args.verbose:
                    print(f"[{rid}] : {r}")
                else:
                    if len(r) > 160:
                        r = r[:160] + "..."
                    print(f"[{rid}] : {[r]}")
        print(f"round {i}: {actual_question_len}+{answer_len}={len(gen_tokens[0])} tokens {t1-t0:.2f} sec. Per-token latency: {model.latency_summary()}")