Created
February 15, 2020 13:57
-
-
Save henry16lin/f5b3c96e8c51d8cef0d171f16bad26ec to your computer and use it in GitHub Desktop.
create_token
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the pre-trained tokenizer and take its vocabulary, which is indexed
# below by token to obtain each token's integer index.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# Show a few randomly chosen token/index pairs from the vocabulary.
import random

# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population contains.
sample_size = min(10, len(vocab))
random_tokens = random.sample(list(vocab), sample_size)
random_ids = [vocab[token] for token in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
# Take a quick look at a handful of entries. The loop variable is named
# token_id so that it does not shadow the builtin id().
for token, token_id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(token, token_id))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment