Created
March 22, 2021 07:46
-
-
Save AlexMikhalev/5a2f1e1b9bf11d4cd69fba9100993798 to your computer and use it in GitHub Desktop.
Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# RedisGears script: tokenize Redis 'sentence:*' hashes with a Hugging Face
# tokenizer and store the resulting input_ids tensors via RedisAI.
# NOTE(review): AutoModel is imported but never used in this chunk — confirm
# it is not needed elsewhere before removing.
from transformers import AutoTokenizer, AutoModel

# Lazily-initialized tokenizer shared across records; populated on first use
# by loadTokeniser() from inside parse_sentence().
tokenizer = None
def loadTokeniser():
    """Initialise the module-level ``tokenizer`` global and return it.

    Loads the pretrained "t5-base" tokenizer. The in-function import is
    presumably deliberate for the RedisGears execution context — confirm
    before consolidating with the top-level import.
    """
    global tokenizer
    from transformers import AutoTokenizer
    # NOTE(review): torchscript=True is a model flag; its effect on a
    # tokenizer is unclear — verify it is intentional.
    tokenizer = AutoTokenizer.from_pretrained("t5-base",torchscript=True)
    # Try RobertaTokenizerFast and BART
    # tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    return tokenizer
def remove_prefix(text, prefix):
    """Return *text* with *prefix* removed from the front, if present.

    Behavior-identical replacement for the original boolean-arithmetic
    slicing trick ``text[text.startswith(prefix) and len(prefix):]``:
    an explicit conditional is clearer and handles the same edge cases
    (no match -> unchanged text; empty prefix -> unchanged text).
    """
    return text[len(prefix):] if text.startswith(prefix) else text
def parse_sentence(record):
    """RedisGears foreach step: tokenize one article hash into a RedisAI tensor.

    *record* is a RedisGears record for a ``sentence:<id>`` hash whose field
    names are numeric sentence positions and whose values are sentence text.
    Sentences are joined in positional order, tokenized, and the resulting
    input_ids tensor is stored under ``tokenized:T5:sum:<id>:inputs_ids``;
    the key is also added to the ``processed_docs_stage3_sum`` set.

    Side effects only; returns None.
    """
    # Kept from the original: presumably ensures torch is loaded in the
    # RedisGears worker before tensors are produced — confirm it is needed.
    import torch
    global tokenizer
    if not tokenizer:
        tokenizer = loadTokeniser()

    # Reassemble the article: hash fields are string keys holding integer
    # positions, so sort numerically (not lexicographically) before joining.
    article_text = []
    for _, value in sorted(record['value'].items(), key=lambda item: int(item[0])):
        article_text.append(value)

    # NOTE(review): max_length without truncation=True only warns/truncates
    # implicitly in older transformers releases — confirm truncation intent.
    inputs = tokenizer.encode_plus(
        " ".join(article_text),
        max_length=1000,
        add_special_tokens=True,
        return_tensors="pt",
    )
    input_ids = inputs['input_ids'].numpy()
    # Fix: the original also read inputs['token_type_ids'], but the t5-base
    # tokenizer does not produce token_type_ids, so that lookup raises
    # KeyError. Neither it nor attention_mask was ever used, so both unused
    # reads (and the unused ``import numpy as np``) are removed.

    key_prefix = 'sentence:'
    article_key = remove_prefix(record['key'], key_prefix)
    token_key = f"tokenized:T5:sum:{article_key}"
    execute('AI.TENSORSET', token_key + ":inputs_ids", input_ids)
    execute('SADD', 'processed_docs_stage3_sum', token_key)
# Build and run the RedisGears pipeline over every key matching 'sentence:*':
# parse_sentence runs as a side-effecting foreach (AI.TENSORSET + SADD per
# record), then count() reduces the stream to the number of records processed.
# GB and execute are provided by the RedisGears runtime, not imported here.
gb = GB()
gb.foreach(parse_sentence)
gb.count()
gb.run('sentence:*')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment