Skip to content

Instantly share code, notes, and snippets.

@AlexMikhalev
Last active April 17, 2021 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlexMikhalev/7257f4adfe57326d248b81bb14ade1f7 to your computer and use it in GitHub Desktop.
Save AlexMikhalev/7257f4adfe57326d248b81bb14ade1f7 to your computer and use it in GitHub Desktop.
QA BERT pre-cached
tokenizer = None
import numpy as np
import torch
import os
# Pick the Redis Cluster host from the DOCKER environment variable:
# unset (or "local") selects the loopback address, any other value selects
# the "rgcluster" host. The three node ports are the same in both cases.
config_switch = os.getenv('DOCKER', 'local')
_cluster_host = "127.0.0.1" if config_switch == 'local' else "rgcluster"
startup_nodes = [
    {"host": _cluster_host, "port": port}
    for port in ("30001", "30002", "30003")
]

# Bind the name up front so it always exists; it stays None when the client
# library is missing or the cluster cannot be reached.
redisai_cluster_client = None
try:
    from redisai import ClusterClient
    redisai_cluster_client = ClusterClient(startup_nodes=startup_nodes)
except Exception:
    # Best effort: report and continue without a client. ``Exception`` (not a
    # bare ``except``) lets KeyboardInterrupt / SystemExit propagate.
    print("Redis Cluster is not available")
def loadTokeniser():
    """Load the SQuAD-finetuned BERT fast tokenizer and return it.

    The loaded tokenizer is also stored in the module-level ``tokenizer``
    global, so later calls in this module can reuse it.
    """
    global tokenizer
    # The import runs at call time rather than at module import.
    from transformers import BertTokenizerFast

    pretrained_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    loaded = BertTokenizerFast.from_pretrained(pretrained_name)
    tokenizer = loaded
    return loaded
def qa(question, sentence_key, hash_tag):
    """Answer ``question`` against a pre-tokenized context held in RedisAI.

    The question is tokenized locally. The context token ids are read from
    the tensor ``tokenized:bert:qa:{sentence_key}``. Both are concatenated,
    stored as ``input_ids{hash_tag}`` / ``token_type_ids{hash_tag}``, and the
    model ``bert-qa{hash_tag}`` is run on them. The answer span is taken from
    the argmax of the start and end score tensors and decoded back to text.

    Args:
        question: Question text to encode with the BERT tokenizer.
        sentence_key: Suffix of the key that holds the context token ids.
        hash_tag: Suffix appended to every tensor and model key used here
            (presumably a Redis Cluster hash tag -- confirm with the caller).

    Returns:
        The decoded answer string.
    """
    ### question is encoded
    ### use pre-computed context/answer text tensor
    global tokenizer
    if tokenizer is None:
        tokenizer = loadTokeniser()

    token_key = f"tokenized:bert:qa:{sentence_key}"

    # encode(..., return_tensors="np") returns a batched (1, n) array, so
    # len() on it would be 1. Flatten first so sizes count tokens.
    input_ids_question = np.asarray(
        tokenizer.encode(question, add_special_tokens=True, truncation=True, return_tensors="np")
    ).reshape(-1)

    ## those two line shall be inside script inside RedisAI
    # Fetch the context once and take its length from the data itself; this
    # replaces a separate meta_only lookup and keeps the segment ids the same
    # length as the flattened input_ids built below.
    input_ids_context = np.asarray(redisai_cluster_client.tensorget(token_key)).reshape(-1)

    num_seg_a = input_ids_question.size
    num_seg_b = input_ids_context.size
    segment_ids = np.array([0] * num_seg_a + [1] * num_seg_b)

    # NOTE(review): int16 is kept from the original; confirm the stored model
    # accepts this dtype for its input ids.
    input_ids = np.append(input_ids_question, input_ids_context).astype(np.int16)
    print(input_ids.shape)
    print(input_ids)

    redisai_cluster_client.tensorset(f'input_ids{hash_tag}', input_ids)
    # TODO: add torchscript (qa_append) to run numpy append input_ids_question and input_ids_context via torch.cat
    redisai_cluster_client.tensorset(f'token_type_ids{hash_tag}', segment_ids)
    redisai_cluster_client.modelrun(
        f'bert-qa{hash_tag}',
        [f'input_ids{hash_tag}', f'token_type_ids{hash_tag}'],
        [f'answer_start_scores{hash_tag}', f'answer_end_scores{hash_tag}'],
    )
    print(f"Model run on {hash_tag}")

    answer_start_scores = redisai_cluster_client.tensorget(f'answer_start_scores{hash_tag}')
    answer_end_scores = redisai_cluster_client.tensorget(f'answer_end_scores{hash_tag}')

    answer_start = np.argmax(answer_start_scores)
    # +1 because the slice end is exclusive.
    answer_end = np.argmax(answer_end_scores) + 1

    # The original read an undefined ``inputs`` here; the ids sent to the
    # model are already in ``input_ids``, so slice those.
    answer_ids = input_ids[answer_start:answer_end].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment