@lovit
Created August 27, 2020 22:28
Hugging Face tokenizers usage
import tokenizers
tokenizers.__version__
'0.8.1'
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)

small_corpus = 'very_small_corpus.txt'

Bert WordPiece Tokenizer

bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    files = [small_corpus],
    vocab_size = 10,
    min_frequency = 1,
    limit_alphabet = 1000,
    initial_alphabet = [],
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    show_progress = True,
    wordpieces_prefix = "##",
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', '##b', '##c', '##d', '##e', '##f']
encoding = bert_wordpiece_tokenizer.encode('ABCDE')
print(encoding.tokens)
print(encoding.ids)
['a', '##b', '##c', '##d', '##e']
[5, 11, 12, 13, 14]
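The ids index into the vocabulary printed above; a quick cross-check (a minimal sketch, assuming the token_to_id / id_to_token helpers of the tokenizers API):

print(bert_wordpiece_tokenizer.token_to_id('##b'))  # 11, matching the ids above
print(bert_wordpiece_tokenizer.id_to_token(5))      # 'a'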
bert_wordpiece_tokenizer.train(
    files = [small_corpus],
    vocab_size = 20,
    min_frequency = 1,
    initial_alphabet = ['g'],
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac']
encodings = bert_wordpiece_tokenizer.encode_batch(['ABCDE', 'abcd'])
print(encodings[0].tokens)
['abc', '##d', '##e']
bert_wordpiece_tokenizer.save_model(
    directory = './',
    name = 'very_small_bertwordpiece'
)
['./very_small_bertwordpiece-vocab.txt']
bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file = './very_small_bertwordpiece-vocab.txt'
)
bert_wordpiece_tokenizer.encode('ABCDE').tokens
['[CLS]', 'abc', '##d', '##e', '[SEP]']
bert_wordpiece_tokenizer.encode('ABCDE', add_special_tokens=False).tokens
['abc', '##d', '##e']
bert_wordpiece_tokenizer.encode(
    sequence = 'abcde',
    pair = 'abcd'
).tokens
['[CLS]', 'abc', '##d', '##e', '[SEP]', 'abc', '##d', '[SEP]']
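For a sentence pair, the Encoding also carries segment ids distinguishing the two sequences (a minimal sketch, assuming the Encoding object exposes type_ids; output not shown):

encoding = bert_wordpiece_tokenizer.encode(sequence='abcde', pair='abcd')
# segment ids: 0 for [CLS] + first sequence + its [SEP], 1 for the pair + its [SEP]
print(encoding.type_ids)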
bert_wordpiece_tokenizer.add_tokens(['lovit'])
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac', 'lovit']
bert_wordpiece_tokenizer.encode('ABCDE abg lovit').tokens
['[CLS]', 'abc', '##d', '##e', '[UNK]', 'lovit', '[SEP]']
# the added token is not saved at the moment
bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file = './very_small_bertwordpiece-vocab.txt'
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac']
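save_model only writes the vocabulary file, so the token added with add_tokens is lost when the tokenizer is rebuilt from that file alone. A possible workaround is serializing the whole tokenizer instead (a minimal sketch, assuming the full-tokenizer serialization introduced around tokenizers 0.8, i.e. save and Tokenizer.from_file; output not shown):

# Sketch: persist the full tokenizer state (vocab, normalizer, added tokens) as JSON.
bert_wordpiece_tokenizer.add_tokens(['lovit'])
bert_wordpiece_tokenizer.save('./very_small_bertwordpiece.json')

from tokenizers import Tokenizer
reloaded = Tokenizer.from_file('./very_small_bertwordpiece.json')
print(reloaded.encode('lovit').tokens)  # the added token should survive the round trip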

SentencePiece BPE Tokenizer

sentencepiece_tokenizer = SentencePieceBPETokenizer(
    add_prefix_space = True,
)
sentencepiece_tokenizer.train(
    files = [small_corpus],
    vocab_size = 20,
    min_frequency = 1,
    special_tokens = ['<unk>'],
)
vocab = sentencepiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['<unk>', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', '▁ABC', 'DE', '▁DE', '▁AC', '▁AF', '▁ABD', '▁ABCDE']
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    add_prefix_space = False
)
sentencepiece_tokenizer.train(
    files = [small_corpus],
    vocab_size = 20,
    min_frequency = 1,
    special_tokens = ['<unk>', 'lovit'],
)
vocab = sentencepiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['<unk>', 'lovit', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', 'DE', '▁ABC', 'AB', 'CDE', '▁AC', '▁AF', '▁ABD', 'ABCDE']
sentencepiece_tokenizer.save_model('./', 'very_small_sentencepiece')
['./very_small_sentencepiece-vocab.json',
 './very_small_sentencepiece-merges.txt']
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    vocab_file = './very_small_sentencepiece-vocab.json',
    merges_file = './very_small_sentencepiece-merges.txt'
)
sentencepiece_tokenizer.encode('ABCDE').tokens
['▁ABC', 'DE']
sentencepiece_tokenizer.encode('ABCDE abc lovit').tokens
['▁ABC',
 'DE',
 '▁',
 '<unk>',
 '<unk>',
 '<unk>',
 '▁',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

Character BPE Tokenizer

charbpe_tokenizer = CharBPETokenizer(suffix='</w>')
charbpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 15,
    min_frequency = 1
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens
['AB', 'C', 'DE</w>', 'ABC</w>']
charbpe_tokenizer = CharBPETokenizer(
    suffix='</w>',
    split_on_whitespace_only = True
)
charbpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 15,
    min_frequency = 1
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens
['AB', 'C', 'D', 'E', 'ABC</w>']
# Note: this cell was left incomplete in the original notebook. The unk_token
# value is missing, so the constructor raises the SyntaxError shown below and
# charbpe_tokenizer remains the instance trained above (without an unk_token).
charbpe_tokenizer = CharBPETokenizer(
    suffix='</w>',
    split_on_whitespace_only = True,
    unk_token=
)
charbpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 15,
    min_frequency = 1
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens
  File "<ipython-input-21-ff7b1b8090ee>", line 5
    )
    ^
SyntaxError: invalid syntax
# Since the cell above failed, this encode uses the tokenizer trained without
# an unk_token: characters not in the vocabulary ('G', 'H') are dropped silently.
charbpe_tokenizer.encode('ABCDEFGH').tokens
['AB', 'C', 'D', 'E', 'F']
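For reference, a completed version of the failed cell might look like the following (the unk_token value '<unk>' is an assumption; its output is not reproduced here). With an unknown token configured, out-of-vocabulary characters should map to '<unk>' instead of being dropped:

charbpe_tokenizer = CharBPETokenizer(
    suffix='</w>',
    split_on_whitespace_only = True,
    unk_token = '<unk>',  # assumed value; the original cell left this blank
)
charbpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 15,
    min_frequency = 1
)
charbpe_tokenizer.encode('ABCDEFGH').tokens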

Byte-level BPE Tokenizer

# OpenAI GPT2 tokenizer
bytebpe_tokenizer = ByteLevelBPETokenizer(
    add_prefix_space = False,
    lowercase = False,
)
bytebpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 1000,
    min_frequency = 1
)
vocab = bytebpe_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'IJ', 'ij', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ĠA', 'ĠAB', 'DE', 'ĠABC', 'AB', 'CDE', 'ĠAC', 'ĠAF', 'ĠABD', 'ABCDE']
bytebpe_tokenizer.encode('ABCDE ABC').tokens
['ABCDE', 'ĠABC']
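The byte-level tokens are shown in the byte-to-unicode alphabet (for example 'Ġ' marks a leading space); decode maps the ids back to text (a quick check, assuming the decode method of the tokenizers API; output not shown):

encoding = bytebpe_tokenizer.encode('ABCDE ABC')
print(bytebpe_tokenizer.decode(encoding.ids))  # should recover 'ABCDE ABC'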

Let's train on COVID-19 related news articles.

from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)


corpus_path = '../data/2020-07-29_covid_news_sents.txt'
add_prefix_space = True
vocab_size = 3000

byte_level_bpe_tokenizer = ByteLevelBPETokenizer()
byte_level_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
byte_level_bpe_tokenizer.save_model(directory='./tokenizers/ByteLevelBPETokenizer/', name='covid')

['./tokenizers/ByteLevelBPETokenizer/covid-vocab.json',
 './tokenizers/ByteLevelBPETokenizer/covid-merges.txt']
char_bpe_tokenizer = CharBPETokenizer()
char_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
char_bpe_tokenizer.save_model(directory='./tokenizers/CharBPETokenizer/', name='covid')

['./tokenizers/CharBPETokenizer/covid-vocab.json',
 './tokenizers/CharBPETokenizer/covid-merges.txt']
sentencepiece_bpe_tokenizer = SentencePieceBPETokenizer()
sentencepiece_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
sentencepiece_bpe_tokenizer.save_model(directory='./tokenizers/SentencePieceBPETokenizer/', name='covid')

['./tokenizers/SentencePieceBPETokenizer/covid-vocab.json',
 './tokenizers/SentencePieceBPETokenizer/covid-merges.txt']
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
bert_wordpiece_tokenizer.save_model(directory='./tokenizers/BertWordPieceTokenizer/', name='covid')

['./tokenizers/BertWordPieceTokenizer/covid-vocab.txt']
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'
tokenizers = [bert_wordpiece_tokenizer,
              sentencepiece_bpe_tokenizer,
              char_bpe_tokenizer,
              byte_level_bpe_tokenizer]

for tokenizer in tokenizers:
    encode_single = tokenizer.encode(sent_ko)
    print(f'\n{tokenizer.__class__.__name__}')
    print(f'tokens = {encode_single.tokens}')
    print(f'ids    = {encode_single.ids}')
BertWordPieceTokenizer
tokens = ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
ids    = [1264, 1275, 1296, 12, 901, 13, 1605, 817, 1561, 1208, 2571]

SentencePieceBPETokenizer
tokens = ['▁신종', '▁코로나바이러스', '▁감염증(코로나19)', '▁사태', '가', '▁심', '각', '합', '니다']
ids    = [1246, 1235, 1275, 1493, 113, 1469, 114, 945, 2633]

CharBPETokenizer
tokens = ['신종</w>', '코로나바이러스</w>', '감염증</w>', '(</w>', '코로나19</w>', ')</w>', '사태', '가</w>', '심', '각', '합니다</w>']
ids    = [1946, 1956, 1948, 1843, 1884, 1821, 2198, 1014, 589, 115, 2480]

ByteLevelBPETokenizer
tokens = ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
ids    = [2472, 875, 898, 7, 616, 397, 8, 1233, 291, 1235, 784, 2247]

Let's use the trained tokenizers with transformers

from transformers import BertTokenizer, GPT2Tokenizer

transformers_bert_tokenizer = BertTokenizer(
    vocab_file = './tokenizers/BertWordPieceTokenizer/covid-vocab.txt'
)
print(f'tokenizers  : {bert_wordpiece_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_bert_tokenizer.tokenize(sent_ko)}')
tokenizers  : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
from unicodedata import normalize

print(normalize('NFKD', '가감'))       # 가감 ; the decomposed jamo are recombined when displayed
print(len(normalize('NFKD', '가감')))  # 5
print(normalize('NFKC', normalize('NFKD', '가감')))      # 가감
print(len(normalize('NFKC', normalize('NFKD', '가감')))) # 2
가감
5
가감
2
def compose(tokens):
    return [normalize('NFKC', token) for token in tokens]

print(f'tokenizers  : {compose(bert_wordpiece_tokenizer.encode(sent_ko).tokens)}')
print(f'transformers: {compose(transformers_bert_tokenizer.tokenize(sent_ko))}')
tokenizers  : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers_gpt2_tokenizer = GPT2Tokenizer(
    vocab_file = './tokenizers/ByteLevelBPETokenizer/covid-vocab.json',
    merges_file = './tokenizers/ByteLevelBPETokenizer/covid-merges.txt'
)
print(f'tokenizers  : {byte_level_bpe_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_gpt2_tokenizer.tokenize(sent_ko)}')
tokenizers  : ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
transformers: ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
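The byte-level tokens are not human-readable as-is; on the transformers side they can be mapped back with convert_tokens_to_string (a quick check; output not shown):

tokens = transformers_gpt2_tokenizer.tokenize(sent_ko)
print(transformers_gpt2_tokenizer.convert_tokens_to_string(tokens))  # should recover sent_ko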
print(compose(transformers_bert_tokenizer.tokenize('lovit 이란 이름은 인식을 안합니다')))
print(compose(transformers_bert_tokenizer.tokenize('lovit 이란 이름은 인식을 안했어')))
['l', '##o', '##v', '##i', '##t', '이라', '##ᆫ', '이', '##름', '##은', '인', '##식을', '안', '##합니다']
['l', '##o', '##v', '##i', '##t', '이라', '##ᆫ', '이', '##름', '##은', '인', '##식을', '안', '##했', '##어']
@sansanai

I have a question about the part above where the tokenizers are trained on COVID-19 related news. For the three tokenizers other than BertWordPieceTokenizer, save_model seems to produce two files, covid-vocab.json and covid-merges.txt. Judging from the file names, covid-vocab.json looks like the vocabulary JSON file, but I am curious what covid-merges.txt contains. Thanks in advance for your answer.
