
@AlexMikhalev
Created March 22, 2021 07:46
Tokenizer: a RedisGears script that reassembles sentence: hashes, tokenizes them with a T5 tokenizer, and stores the input ids as RedisAI tensors.
from transformers import AutoTokenizer

tokenizer = None

def loadTokeniser():
    # Lazily build the tokenizer once per RedisGears worker
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained("t5-base", torchscript=True)
    # Try RobertaTokenizerFast and BART
    # tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    return tokenizer

def remove_prefix(text, prefix):
    # Strip a leading prefix if present (str.removeprefix needs Python 3.9+)
    return text[len(prefix):] if text.startswith(prefix) else text

def parse_sentence(record):
    import torch  # required by return_tensors="pt"
    import numpy as np
    global tokenizer
    if not tokenizer:
        tokenizer = loadTokeniser()

    # The hash fields are numbered text chunks; reassemble them in order
    article_text = []
    for _, value in sorted(record['value'].items(), key=lambda item: int(item[0])):
        article_text.append(value)

    inputs = tokenizer.encode_plus(
        " ".join(article_text),
        max_length=1000,
        truncation=True,  # avoid overflowing max_length
        add_special_tokens=True,
        return_tensors="pt",
    )
    input_ids = inputs['input_ids'].numpy()
    attention_mask = inputs['attention_mask'].numpy()  # computed but not stored below
    # The T5 tokenizer does not return token_type_ids, so guard the lookup
    token_type_ids = inputs.get('token_type_ids')

    key_prefix = 'sentence:'
    article_key = remove_prefix(record['key'], key_prefix)
    token_key = f"tokenized:T5:sum:{article_key}"

    # AI.TENSORSET takes a dtype, the tensor shape and a raw blob;
    # a numpy array passed as a single argument would not serialise
    execute('AI.TENSORSET', token_key + ":inputs_ids",
            'INT64', *[str(dim) for dim in input_ids.shape],
            'BLOB', input_ids.astype(np.int64).tobytes())
    execute('SADD', 'processed_docs_stage3_sum', token_key)

gb = GB()
gb.foreach(parse_sentence)
gb.count()
gb.run('sentence:*')
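
For context, GB and execute are built-ins that RedisGears injects into gear scripts, so this file is meant to run inside Redis rather than as a standalone Python program. Below is a minimal sketch of submitting it from a client with redis-py, assuming a local Redis instance with the RedisGears module loaded and the script saved as tokenizer_gear.py (a hypothetical file name):

# Sketch: submit the gear above to RedisGears with redis-py.
# Assumes Redis with the RedisGears module at localhost:6379;
# the file name tokenizer_gear.py is an assumption for illustration.
import redis

r = redis.Redis(host="localhost", port=6379)
with open("tokenizer_gear.py") as f:
    script = f.read()
# RG.PYEXECUTE ships the script to the server; for a batch run like
# gb.run('sentence:*') it blocks by default until the run completes
print(r.execute_command("RG.PYEXECUTE", script))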
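
To check what the gear produced, the stored tensors can be read back with RedisAI's AI.TENSORGET. A small sketch, assuming RedisAI is loaded and using a hypothetical article key of 42:

# Sketch: read a stored tensor back from RedisAI with redis-py.
# Assumes RedisAI is loaded; the article key "42" is hypothetical.
import numpy as np
import redis

r = redis.Redis(host="localhost", port=6379)
reply = r.execute_command("AI.TENSORGET", "tokenized:T5:sum:42:inputs_ids", "META", "BLOB")
# The reply is a flat list of field/value pairs: dtype, shape, blob
fields = dict(zip(reply[::2], reply[1::2]))
shape = [int(d) for d in fields[b'shape']]
ids = np.frombuffer(fields[b'blob'], dtype=np.int64).reshape(shape)
print(shape, ids[0, :10])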