# Gist by @penut85420, created January 13, 2020
# Loads BERT/ALBERT modules from TF Hub and demos tokenization plus
# sentence-to-ID conversion with padding.

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow C++ logging

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # silence TF Python logging

import numpy as np
import tensorflow_hub as hub

from penut.utils import TimeCost
from bert.tokenization import FullTokenizer
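
# Note: this script relies on the TF1-style hub.Module API (via tf.compat.v1).
# `bert.tokenization` comes from the `bert-tensorflow` package, and
# `penut.utils.TimeCost` appears to be the author's own timing helper
# (a context manager that reports elapsed wall time).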


class BertTokenizer:
    def __init__(self, bert_path, tokenizer_cls, maxlen=512):
        self.maxlen = maxlen
        # Pull the vocab file path and casing flag out of the TF Hub module's
        # tokenization_info signature, then build the actual tokenizer.
        with tf.compat.v1.Session() as sess:
            bert = hub.Module(bert_path)
            tk_info = bert(signature='tokenization_info', as_dict=True)
            tk_info = [tk_info['vocab_file'], tk_info['do_lower_case']]
            vocab_file, do_lower_case = sess.run(tk_info)
        self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)
    def convert_sentences_to_ids(self, sentences):
        ids = list(map(self.convert_single_sentence_to_ids, sentences))
        return np.array(ids)
    def convert_single_sentence_to_ids(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)
        # Truncate so that [CLS]/[SEP] still fit within maxlen; without this,
        # long sentences would produce rows longer than maxlen and np.array()
        # could not build a rectangular matrix.
        tokens = tokens[:self.maxlen - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)
    def tokenize_demo(self, sents):
        # Tokenize each sentence without adding special tokens or padding.
        return list(map(self.tokenizer.tokenize, sents))
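
# Sequence layout produced by convert_single_sentence_to_ids (illustrative
# sketch; the token IDs below are hypothetical, real values come from the
# module's vocab file):
#
#   maxlen = 8, sentence = 'Hello, bert!'
#   tokens -> ['[CLS]', 'hello', ',', 'bert', '!', '[SEP]', '[PAD]', '[PAD]']
#   ids    -> [101, 7592, 1010, 14324, 999, 102, 0, 0]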


def en_bert():
    bert_path = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
    demo_sentences = [
        'Hello, bert!',
        'This may be a useful example.',
        'What about unknown words in tokenization?',
    ]
    bert_demo(bert_path, demo_sentences)


def zh_bert():
    bert_path = 'https://tfhub.dev/google/bert_chinese_L-12_H-768_A-12/1'
    demo_sentences = [
        '哈囉,Bert!',  # "Hello, Bert!"
        '這也許會是個有用的範例。',  # "This might be a useful example."
        '如果星爆氣流斬出現在句子裡面呢?',  # "What if 'Starburst Stream' shows up in the sentence?"
        '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',  # the same sentence, pre-segmented with spaces
    ]
    bert_demo(bert_path, demo_sentences)


def multi_bert():
    bert_path = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
    # Same English and Chinese sentences as above, run through the
    # multilingual cased model.
    demo_sentences = [
        'Hello, bert!',
        'This may be a useful example.',
        'What about unknown words in tokenization?',
        '哈囉,Bert!',
        '這也許會是個有用的範例。',
        '如果星爆氣流斬出現在句子裡面呢?',
        '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',
    ]
    bert_demo(bert_path, demo_sentences)


def albert():
    bert_path = 'https://tfhub.dev/google/albert_xxlarge/3'
    # Note: ALBERT Hub modules ship a SentencePiece model rather than a
    # WordPiece vocab, so bert.tokenization.FullTokenizer may not tokenize
    # it correctly.
    demo_sentences = [
        'Hello, bert!',
        'This may be a useful example.',
        'What about unknown words in tokenization?',
        '哈囉,Bert!',
        '這也許會是個有用的範例。',
        '如果星爆氣流斬出現在句子裡面呢?',
        '如果 星爆 氣流 斬 出現 在 句子 裡面 呢 ?',
    ]
    bert_demo(bert_path, demo_sentences)


def bert_demo(bert_path, demo_sents):
    # Time how long it takes to fetch the module and build the tokenizer.
    with TimeCost('Get Tokenizer Time Cost'):
        bt = BertTokenizer(bert_path, FullTokenizer, 20)
    print(f'Tokenization Demo: {bt.tokenize_demo(demo_sents)}')
    print(f'ID Shape: {bt.convert_sentences_to_ids(demo_sents).shape}\n')


if __name__ == '__main__':
    # en_bert()
    # zh_bert()
    # multi_bert()
    albert()
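
# Example of what en_bert() prints (a sketch; exact wordpieces depend on the
# vocab shipped with the chosen Hub module, but the uncased English model
# tokenizes roughly like this):
#
#   bt = BertTokenizer('https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
#                      FullTokenizer, 20)
#   bt.tokenize_demo(['Hello, bert!'])
#   # -> [['hello', ',', 'bert', '!']]
#   bt.convert_sentences_to_ids(['Hello, bert!']).shape
#   # -> (1, 20)   # every row is padded/truncated to maxlen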