Created January 13, 2020, 06:11
Save penut85420/072891df87df021466c8856679fe701d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Silence TensorFlow's C++ logging. NOTE: TF_CPP_MIN_LOG_LEVEL must be set
# in the environment BEFORE `import tensorflow`, so the statement order here
# is deliberate and must not be rearranged.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Suppress Python-level warnings (TF 1.x compat APIs emit many deprecation notices).
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
# Quiet TF's own Python logger down to errors only.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np
import tensorflow_hub as hub
from penut.utils import TimeCost  # project-local timing context manager
from bert.tokenization import FullTokenizer  # WordPiece tokenizer from the `bert` package
class BertTokenizer:
    """Wrap a WordPiece tokenizer whose vocab comes from a TF-Hub BERT module.

    The hub module's ``tokenization_info`` signature supplies the vocab file
    path and the lower-casing flag, which are used to construct
    ``tokenizer_cls`` (e.g. ``bert.tokenization.FullTokenizer``).
    """

    def __init__(self, bert_path, tokenizer_cls, maxlen=512):
        """
        Args:
            bert_path: TF-Hub module URL/path exposing 'tokenization_info'.
            tokenizer_cls: tokenizer class, called as
                ``tokenizer_cls(vocab_file, do_lower_case)``.
            maxlen: fixed output sequence length, including [CLS] and [SEP].
        """
        self.maxlen = maxlen
        # TF1-style: run the tokenization_info signature once to fetch the
        # vocab file path and casing flag from the hub module.
        with tf.compat.v1.Session() as sess:
            bert = hub.Module(bert_path)
            tk_info = bert(signature='tokenization_info', as_dict=True)
            tk_info = [tk_info['vocab_file'], tk_info['do_lower_case']]
            vocab_file, do_lower_case = sess.run(tk_info)
            self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)

    def convert_sentences_to_ids(self, sentences):
        """Convert sentences to a dense ``(len(sentences), maxlen)`` id array."""
        ids = [self.convert_single_sentence_to_ids(s) for s in sentences]
        return np.array(ids)

    def convert_single_sentence_to_ids(self, sentence):
        """Tokenize one sentence and return exactly ``maxlen`` token ids.

        Fix: the original never truncated, so a sentence with more than
        ``maxlen - 2`` tokens produced a row longer than ``maxlen`` (the
        padding count went negative) and ``convert_sentences_to_ids`` then
        built a ragged array. Long inputs are now truncated, reserving two
        slots for the [CLS]/[SEP] special tokens.
        """
        tokens = self.tokenizer.tokenize(sentence)
        tokens = tokens[:self.maxlen - 2]  # leave room for [CLS] and [SEP]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def tokenize_demo(self, sents):
        """Return the raw token list for each sentence (no ids, no padding)."""
        return [self.tokenizer.tokenize(s) for s in sents]
def en_bert():
    """Run the tokenization demo with the English uncased BERT hub module."""
    bert_demo(
        'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
        [
            'Hello, bert!',
            'This may be a useful example.',
            'What about unknown words in tokenization?',
        ],
    )
def zh_bert():
    """Run the tokenization demo with the Chinese BERT hub module."""
    bert_demo(
        'https://tfhub.dev/google/bert_chinese_L-12_H-768_A-12/1',
        [
            '哈囉,Bert!',
            '這也許會是個有用的範例。',
            '如果星爆氣流斬出現在句子裡面呢?',
            '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',
        ],
    )
def multi_bert():
    """Run the tokenization demo with the multilingual cased BERT hub module."""
    bert_demo(
        'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1',
        [
            'Hello, bert!',
            'This may be a useful example.',
            'What about unknown words in tokenization?',
            '哈囉,Bert!',
            '這也許會是個有用的範例。',
            '如果星爆氣流斬出現在句子裡面呢?',
            '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',
        ],
    )
def albert():
    """Run the tokenization demo with the ALBERT-xxlarge hub module."""
    bert_demo(
        'https://tfhub.dev/google/albert_xxlarge/3',
        [
            'Hello, bert!',
            'This may be a useful example.',
            'What about unknown words in tokenization?',
            '哈囉,Bert!',
            '這也許會是個有用的範例。',
            '如果星爆氣流斬出現在句子裡面呢?',
            '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',
        ],
    )
def bert_demo(bert_path, demo_sents):
    """Build a BertTokenizer (maxlen=20) for `bert_path`, timing construction,
    then print the token lists and the padded id-matrix shape for the demo
    sentences.
    """
    # NOTE(review): source indentation was lost; the TimeCost label
    # ("Get Tokenizer Time Cost") suggests only construction is timed.
    with TimeCost('Get Tokenizer Time Cost'):
        tk = BertTokenizer(bert_path, FullTokenizer, 20)
    print(f'Tokenization Demo: {tk.tokenize_demo(demo_sents)}')
    print(f'ID Shape: {tk.convert_sentences_to_ids(demo_sents).shape}\n')
if __name__ == "__main__": | |
# en_bert() | |
# zh_bert() | |
# multi_bert() | |
albert() |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.