@AlexMikhalev, forked from lantiga/export_trace.py. Created March 18, 2021.
🤗 Huggingface Bert on RedisAI
# export_trace.py: trace the SQuAD-finetuned BERT QA model to TorchScript
from transformers import BertForQuestionAnswering
import torch

bert_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(bert_name, torchscript=True)
model.eval()

# Dummy inputs (input_ids, attention_mask, token_type_ids) used only to trace the graph
inputs = [torch.ones(1, 2, dtype=torch.int64),
          torch.ones(1, 2, dtype=torch.int64),
          torch.ones(1, 2, dtype=torch.int64)]

with torch.no_grad():
    traced_model = torch.jit.trace(model, inputs)

torch.jit.save(traced_model, "traced_bert_qa.pt")
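A quick local sanity check (not part of the original gist, but harmless) is to reload the saved TorchScript file and run it on the same dummy inputs; the traced QA model returns a tuple of start and end score tensors.

# Hypothetical sanity check: verify the exported trace loads and runs
reloaded = torch.jit.load("traced_bert_qa.pt")
with torch.no_grad():
    start_scores, end_scores = reloaded(*inputs)
print(start_scores.shape, end_scores.shape)  # both (1, 2) for the dummy inputs above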
# Load the traced model into RedisAI
import redisai

r = redisai.Client()

model_file = 'traced_bert_qa.pt'
with open(model_file, 'rb') as f:
    model = f.read()

# Very large blobs can be split into chunks before storing; the chunks are
# not used below, since this blob fits in a single MODELSET call.
chunk_size = 500 * 1024 * 1024
model_chunks = [model[i:i + chunk_size] for i in range(0, len(model), chunk_size)]

r.modelset('bert-qa', 'TORCH', 'CPU', model)
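To confirm the blob was stored, the model metadata can be read back. This is a sketch, assuming the installed redisai-py exposes modelget() with a meta_only flag, as recent client releases do.

# Assumption: redisai-py's modelget() is available; fetch metadata without the blob
meta = r.modelget('bert-qa', meta_only=True)
print(meta)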
# Query the model through RedisAI: tokenize each question with the context,
# run the stored graph, and decode the predicted answer span.
import redisai
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
r = redisai.Client()
text = r"""
At a very high level, one of the most critical steps in any ML pipeline is called AI serving, a task usually performed by an AI inference engine. The AI inference engine is responsible for the model deployment and performance monitoring steps in the figure above, and represents a whole new world that will eventually determine whether applications can use AI technologies to improve operational efficiencies and solve real business problems.
"""
questions = [
"What is the most critical step in any ML pipeline?",
"What is AI serving?",
"What is an AI inference engine?",
]
for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs['input_ids'].numpy()
    attention_mask = inputs['attention_mask'].numpy()
    token_type_ids = inputs['token_type_ids'].numpy()

    # Ship the encoded inputs to Redis, run the model, and read back the scores
    r.tensorset('input_ids', input_ids)
    r.tensorset('attention_mask', attention_mask)
    r.tensorset('token_type_ids', token_type_ids)
    r.modelrun('bert-qa', ['input_ids', 'attention_mask', 'token_type_ids'],
               ['answer_start_scores', 'answer_end_scores'])
    answer_start_scores = r.tensorget('answer_start_scores')
    answer_end_scores = r.tensorget('answer_end_scores')

    # Decode the highest-scoring answer span back into text
    answer_start = np.argmax(answer_start_scores)
    answer_end = np.argmax(answer_end_scores) + 1
    input_ids = inputs["input_ids"].tolist()[0]
    output_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    answer = tokenizer.convert_tokens_to_string(output_tokens)
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")