Last active
November 17, 2022 07:12
-
-
Save korakot/ad3e594db148c88a3e2b83618af07c76 to your computer and use it in GitHub Desktop.
Using custom word list with pythainlp Tokenizer class. Also with spaCy.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from https://www.facebook.com/groups/thainlp/permalink/721765038204989/
# Create a PyThaiNLP Tokenizer with a custom word list.
import pythainlp
from pythainlp import Tokenizer
# Explicit submodule import: `import pythainlp` alone does not guarantee
# that the `pythainlp.corpus` attribute is loaded.
from pythainlp.corpus import thai_words
from pythainlp.corpus import ttc

# Two equivalent ways to construct a Tokenizer:
# ... from an in-memory word list,
tokenizer = Tokenizer(["word1", "word2"])
# ... or from a dictionary file on disk (one word per line).
# NOTE: this rebinds `tokenizer`, discarding the instance above —
# the two lines only demonstrate the alternatives.
tokenizer = Tokenizer("path/dict.txt")

# Start from PyThaiNLP's default word list ...
words = thai_words()
# ... or use the TTC (Thai Textbook Corpus) frequency list instead.
words = [w for w, _ in ttc.word_freqs()]

# The word list can be edited here before building the tokenizer.
tok = Tokenizer(words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install pythainlp | |
import pythainlp | |
from pythainlp.corpus import ttc | |
# create custom tokenizer | |
min_words = [w for w,_ in ttc.word_freqs()] # can add to it | |
tok = pythainlp.Tokenizer(min_words) | |
# use it with spaCy | |
from spacy.lang.th import Thai | |
nlp = Thai() | |
nlp.tokenizer.word_tokenize = tok.word_tokenize # change to custom | |
list(nlp('ฝนตกที่ทะเล')) | |
# ['ฝน', 'ตก', 'ที่', 'ทะเล'] because no 'ฝนตก' in min_words |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# just use word_tokenize, but with custom dict | |
!pip install pythainlp -q | |
import pythainlp | |
from pythainlp.tokenize import word_tokenize, newmm | |
from pythainlp.corpus import ttc | |
words = [w for w,_ in ttc.word_freqs()] | |
trie = pythainlp.util.Trie(words) | |
newmm.DEFAULT_WORD_DICT_TRIE = trie | |
word_tokenize('ฝนตกทั่วฟ้า') # ['ฝน', 'ตก', 'ทั่ว', 'ฟ้า'] |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.