import os

import huggingface_konlpy
from huggingface_konlpy .tokenizers_konlpy import KoNLPyPreTokenizer
from huggingface_konlpy .tokenizers_konlpy import KoNLPyPretokBertWordPieceTokenizer
from huggingface_konlpy .transformers_konlpy import KoNLPyPretokBertTokenizer
# Shared inputs for the tokenizer walkthrough in this script.
vocab_size = 3000  # vocabulary size setting
corpus_path = '../data/2020-07-29_covid_news_sents.txt'  # corpus file path
# Sample Korean sentence that the tokenizers below are applied to.
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'
from konlpy .tag import Komoran
# Wrap a Komoran tagger in the KoNLPy pre-tokenizer and print what it
# returns for the sample sentence.
konlpy_pretok = KoNLPyPreTokenizer(Komoran())
pretokenized_sent = konlpy_pretok(sent_ko)
print(pretokenized_sent)
# Output: 신종 코로나바이러스 감염증 ( 코로나 19 ) 사태 가 심각 하 ㅂ니다
from tokenizers import BertWordPieceTokenizer
# Train a BERT WordPiece tokenizer on the corpus file listed below and write
# the resulting vocabulary to disk.
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    # NOTE(review): the '.komoran' suffix suggests this is a Komoran
    # pre-tokenized copy of the corpus -- confirm how it is produced.
    files=['../data/2020-07-29_covid_news_sents.komoran.txt'],
    # Reuse the script-level setting instead of repeating the literal 3000.
    vocab_size=vocab_size,
)
# Create the output directory up front so saving does not depend on it
# already existing.
os.makedirs('./tokenizers/KomoranBertWordPieceTokenizer/', exist_ok=True)
# Directory and name are passed positionally, as in the other save_model
# calls in this script.
bert_wordpiece_tokenizer.save_model(
    './tokenizers/KomoranBertWordPieceTokenizer/',
    'covid',
)
# Output: ['./tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt']
from huggingface_konlpy import compose
# Build a WordPiece tokenizer that is given the KoNLPy pre-tokenizer and the
# vocabulary file saved earlier in this script.
konlpy_bert_wordpiece_tokenizer = KoNLPyPretokBertWordPieceTokenizer(
    konlpy_pretok,
    vocab_file='./tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt',
)

# Encode the sample sentence with both tokenizers and print the composed
# tokens one after the other for comparison.
pretok_encoding = konlpy_bert_wordpiece_tokenizer.encode(
    sent_ko, add_special_tokens=False
)
print(compose(pretok_encoding.tokens))
plain_encoding = bert_wordpiece_tokenizer.encode(sent_ko)
print(compose(plain_encoding.tokens))
# Output:
# ['신종', '코로나바이러스', '감염증', '(', '코로나', '19', ')', '사태', '가', '심각', '하', 'ᄇ니다']
# ['신종', '코로나바이러스', '감염증', '(', '코로나', '##1', '##9', ')', '사태', '##가', '심각', '##합', '##니다']
# Same pre-tokenizer and vocabulary file, this time through the tokenizer
# class imported from transformers_konlpy.
konlpy_bert_tokenizer = KoNLPyPretokBertTokenizer(
    konlpy_pretok,
    './tokenizers/KomoranBertWordPieceTokenizer/covid-vocab.txt',
)
komoran_bert_tokens = konlpy_bert_tokenizer.tokenize(sent_ko)
print(compose(komoran_bert_tokens))
# Output: ['신종', '코로나바이러스', '감염증', '(', '코로나', '19', ')', '사태', '가', '심각', '하', 'ᄇ니다']
# ---- KoNLPy WordPiece Tokenizer ----
from huggingface_konlpy .tokenizers_konlpy import KoNLPyWordPieceTokenizer
from huggingface_konlpy .tokenizers_konlpy import KoNLPyTokenizersTokenizer
from huggingface_konlpy .transformers_konlpy import KoNLPyBertTokenizer
from konlpy .tag import Mecab
# Mecab-backed WordPiece tokenizer created with use_tag=False; print its
# tokens for the sample sentence.
mecab_wordpiece_notag = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False)
notag_pieces = mecab_wordpiece_notag.tokenize(sent_ko)
print(notag_pieces)
# Output: ['신종', '코로나', '##바이러스', '감염증', '##(', '##코로나', '##19', '##)', '사태', '##가', '심각', '##합니다']
# Train a vocabulary with a Mecab-backed trainer (use_tag=False) and save it
# under the 'notag' name.
mecab_wordpiece_notag_trainer = KoNLPyTokenizersTokenizer(Mecab(), use_tag=False)
mecab_wordpiece_notag_trainer.train(
    # Same file as the script-level corpus_path; reuse it rather than
    # repeating the path literal.
    files=[corpus_path],
)
# Create the output directory up front so saving does not depend on it
# already existing.
os.makedirs('./tokenizers/BertStyleMecab/', exist_ok=True)
mecab_wordpiece_notag_trainer.save_model('./tokenizers/BertStyleMecab/', 'notag')
# Pair a fresh Mecab WordPiece tokenizer (use_tag=False) with the saved
# 'notag' vocabulary file and print its tokens for the sample sentence.
notag_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False)
konlpy_bert_notag = KoNLPyBertTokenizer(
    konlpy_wordpiece=notag_wordpiece,
    vocab_file='./tokenizers/BertStyleMecab/notag-vocab.txt',
)
notag_bert_tokens = konlpy_bert_notag.tokenize(sent_ko)
print(notag_bert_tokens)
# Output:
# Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 89579.94it/s]
# Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4936.72it/s]
# [/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/notag-vocab.txt]
# ['신종', '코로나', '##바이러스', '감염증', '##(', '##코로나', '##19', '##)', '사태', '##가', '심각', '##합니다']
# Mecab-backed WordPiece tokenizer created with use_tag=True; print its
# tokens for the sample sentence.
mecab_wordpiece_usetag = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True)
usetag_pieces = mecab_wordpiece_usetag.tokenize(sent_ko)
print(usetag_pieces)
# Output: ['신종/NNG', '코로나/NNP', '##바이러스/NNG', '감염증/NNG', '##(/SSO', '##코로나/NNP', '##19/SN', '##)/SSC', '사태/NNG', '##가/JKS', '심각/XR', '##합니다/XSA+EC']
# Train a vocabulary with a Mecab-backed trainer (use_tag=True) and save it
# under the 'usetag' name.
mecab_wordpiece_usetag_trainer = KoNLPyTokenizersTokenizer(Mecab(), use_tag=True)
mecab_wordpiece_usetag_trainer.train(
    # Same file as the script-level corpus_path; reuse it rather than
    # repeating the path literal.
    files=[corpus_path],
)
# Create the output directory up front so saving does not depend on it
# already existing.
os.makedirs('./tokenizers/BertStyleMecab/', exist_ok=True)
mecab_wordpiece_usetag_trainer.save_model('./tokenizers/BertStyleMecab/', 'usetag')
# Pair a fresh Mecab WordPiece tokenizer (use_tag=True) with the saved
# 'usetag' vocabulary file and print its tokens for the sample sentence.
# NOTE(review): the output recorded in this file for this call ends with
# single-character tokens instead of one tagged piece -- presumably that
# piece is absent from the trained vocabulary; verify.
usetag_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True)
konlpy_bert_usetag = KoNLPyBertTokenizer(
    konlpy_wordpiece=usetag_wordpiece,
    vocab_file='./tokenizers/BertStyleMecab/usetag-vocab.txt',
)
usetag_bert_tokens = konlpy_bert_usetag.tokenize(sent_ko)
print(usetag_bert_tokens)
# Output:
# Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 89950.42it/s]
# Train vocab 1/1: 100%|██████████| 70964/70964 [00:14<00:00, 4939.54it/s]
# [/mnt/lovit/git/transformers_konlpy_trainer/tutorials/tokenizers/BertStyleMecab/usetag-vocab.txt]
# ['신종/NNG', '코로나/NNP', '##바이러스/NNG', '감염증/NNG', '##(/SSO', '##코로나/NNP', '##19/SN', '##)/SSC', '사태/NNG', '##가/JKS', '심각/XR', '합', '니', '다']