Huggingface tokenizers / transformers + KoNLPy
@lovit · Created August 27, 2020 22:29

import huggingface_konlpy

KoNLPy as pre-tokenizer
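In this first setup, a KoNLPy analyzer (Komoran) is used as a pre-tokenizer: raw sentences are split into space-separated morphemes, and Huggingface's WordPiece model is then trained on and applied to that pre-tokenized text.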

from huggingface_konlpy.tokenizers_konlpy import KoNLPyPreTokenizer
from huggingface_konlpy.tokenizers_konlpy import KoNLPyPretokBertWordPieceTokenizer
from huggingface_konlpy.transformers_konlpy import KoNLPyPretokBertTokenizer
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'
corpus_path = '../data/2020-07-29_covid_news_sents.txt'
vocab_size = 3000
from konlpy.tag import Komoran

konlpy_pretok = KoNLPyPreTokenizer(Komoran())
print(konlpy_pretok(sent_ko))
신종 코로나바이러스 감염증 ( 코로나 19 ) 사태 가 심각 하 ㅂ니다
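The output above suggests that KoNLPyPreTokenizer simply space-joins the morphemes produced by Komoran. Below is a minimal sketch of that idea, and of how the pre-tokenized training file used in the next cell ('../data/2020-07-29_covid_news_sents.komoran.txt') could be produced from the raw corpus. Treating that file as the pre-tokenized version of corpus_path is an assumption, not something the gist states.

# A minimal sketch, assuming the pre-tokenization step amounts to
# space-joining Komoran morphemes. Illustration only, not the
# huggingface_konlpy implementation. Reuses Komoran and corpus_path
# defined above.
komoran = Komoran()

def komoran_pretokenize(sent):
    return ' '.join(komoran.morphs(sent))

# Assumption: the '*.komoran.txt' file used for training below is the raw
# corpus with this pre-tokenization applied line by line.
with open(corpus_path, encoding='utf-8') as fin, \
     open('../data/2020-07-29_covid_news_sents.komoran.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        if line:
            fout.write(komoran_pretokenize(line) + '\n')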
from tokenizers import BertWordPieceTokenizer

bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    files = ['../data/2020-07-29_covid_news_sents.komoran.txt'],
    vocab_size = 3000
)
bert_wordpiece_tokenizer.save_model(
    directory='./tokenizers/KomoranBertWordPieceTokenizer/',
    name='covid'
)
['./tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt']
from huggingface_konlpy import compose

konlpy_bert_wordpiece_tokenizer = KoNLPyPretokBertWordPieceTokenizer(
    konlpy_pretok,
    vocab_file = './tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt'
)

print(compose(konlpy_bert_wordpiece_tokenizer.encode(sent_ko, add_special_tokens=False).tokens))
print(compose(bert_wordpiece_tokenizer.encode(sent_ko).tokens))
['신종', '코로나바이러스', '감염증', '(', '코로나', '19', ')', '사태', '가', '심각', '하', 'ᄇ니다']
['신종', '코로나바이러스', '감염증', '(', '코로나', '##1', '##9', ')', '사태', '##가', '심각', '##합', '##니다']
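Comparing the two outputs: with the Komoran pre-tokenizer the subwords follow morpheme boundaries ('19', '가', '하', 'ㅂ니다' stay separate pieces), while the plain WordPiece model trained on raw text splits by surface form only ('##1', '##9', '##가', '##합', '##니다').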
konlpy_bert_tokenizer = KoNLPyPretokBertTokenizer(
    konlpy_pretok, './tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt'
)
print(compose(konlpy_bert_tokenizer.tokenize(sent_ko)))
['신종', '코로나바이러스', '감염증', '(', '코로나', '19', ')', '사태', '가', '심각', '하', 'ᄇ니다']
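KoNLPyPretokBertTokenizer comes from huggingface_konlpy.transformers_konlpy and exposes tokenize() above; it is assumed here to follow the usual transformers BertTokenizer interface. If that assumption holds, the standard encoding calls apply:

# Assumption: the standard transformers BertTokenizer methods are inherited.
tokens = konlpy_bert_tokenizer.tokenize(sent_ko)
token_ids = konlpy_bert_tokenizer.convert_tokens_to_ids(tokens)
input_ids = konlpy_bert_tokenizer.encode(sent_ko)  # with [CLS] / [SEP] ids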

KoNLPy WordPiece Tokenizer
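In this second setup, the KoNLPy analyzer itself produces BERT-style wordpieces: morphemes that do not start a word are prefixed with '##', and the POS tag can optionally be kept as part of each piece (use_tag=True).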

from huggingface_konlpy.tokenizers_konlpy import KoNLPyWordPieceTokenizer
from huggingface_konlpy.tokenizers_konlpy import KoNLPyTokenizersTokenizer
from huggingface_konlpy.transformers_konlpy import KoNLPyBertTokenizer
from konlpy.tag import Mecab

mecab_wordpiece_notag = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False)
print(mecab_wordpiece_notag.tokenize(sent_ko))
['신종', '코로나', '##바이러스', '감염증', '##(', '##코로나', '##19', '##)', '사태', '##가', '심각', '##합니다']
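The pieces above are consistent with a simple per-eojeol scheme: each whitespace-separated word is analyzed with Mecab, the first morpheme keeps its surface form, and every following morpheme gets a '##' prefix. A minimal sketch of that idea (an illustration, not the library code):

mecab = Mecab()

def mecab_wordpiece_notag_sketch(sent):
    # '##' marks morphemes that do not start a whitespace-separated word
    pieces = []
    for eojeol in sent.split():
        for i, morph in enumerate(mecab.morphs(eojeol)):
            pieces.append(morph if i == 0 else '##' + morph)
    return pieces

print(mecab_wordpiece_notag_sketch(sent_ko))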
mecab_wordpiece_notag_trainer = KoNLPyTokenizersTokenizer(Mecab(), use_tag=False)
mecab_wordpiece_notag_trainer.train(
    files = ['../data/2020-07-29_covid_news_sents.txt']
)
mecab_wordpiece_notag_trainer.save_model('./tokenizers/BertStyleMecab/', 'notag')

konlpy_bert_notag = KoNLPyBertTokenizer(
    konlpy_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
    vocab_file = './tokenizers/BertStyleMecab/notag-vocab.txt'
)
print(konlpy_bert_notag.tokenize(sent_ko))
Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 89579.94it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4936.72it/s]


[/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/notag-vocab.txt]
['신종', '코로나', '##바이러스', '감염증', '##(', '##코로나', '##19', '##)', '사태', '##가', '심각', '##합니다']
mecab_wordpiece_usetag = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True)
print(mecab_wordpiece_usetag.tokenize(sent_ko))
['신종/NNG', '코로나/NNP', '##바이러스/NNG', '감염증/NNG', '##(/SSO', '##코로나/NNP', '##19/SN', '##)/SSC', '사태/NNG', '##가/JKS', '심각/XR', '##합니다/XSA+EC']
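The same sketch with use_tag=True would attach the POS tag to each piece, assuming each piece is formatted as 'morpheme/TAG' from Mecab's pos() output:

def mecab_wordpiece_usetag_sketch(sent):
    # reuses the `mecab` instance from the sketch above
    pieces = []
    for eojeol in sent.split():
        for i, (morph, tag) in enumerate(mecab.pos(eojeol)):
            piece = '{}/{}'.format(morph, tag)
            pieces.append(piece if i == 0 else '##' + piece)
    return pieces

print(mecab_wordpiece_usetag_sketch(sent_ko))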
mecab_wordpiece_usetag_trainer = KoNLPyTokenizersTokenizer(Mecab(), use_tag=True)
mecab_wordpiece_usetag_trainer.train(
    files = ['../data/2020-07-29_covid_news_sents.txt']
)
mecab_wordpiece_usetag_trainer.save_model('./tokenizers/BertStyleMecab/', 'usetag')

konlpy_bert_usetag = KoNLPyBertTokenizer(
    konlpy_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True),
    vocab_file = './tokenizers/BertStyleMecab/usetag-vocab.txt'
)
print(konlpy_bert_usetag.tokenize(sent_ko))
Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 89950.42it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4939.54it/s]


[/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/usetag-vocab.txt]
['신종/NNG', '코로나/NNP', '##바이러스/NNG', '감염증/NNG', '##(/SSO', '##코로나/NNP', '##19/SN', '##)/SSC', '사태/NNG', '##가/JKS', '심각/XR', '합', '니', '다']
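Note that the last piece differs from the direct tokenization above: '심각/XR' is followed by '합', '니', '다' instead of '##합니다/XSA+EC', presumably because that tagged subword did not make it into the trained vocabulary and the tokenizer falls back to smaller pieces for it.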