Skip to content

Instantly share code, notes, and snippets.

@korakot
Last active November 17, 2022 07:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save korakot/ad3e594db148c88a3e2b83618af07c76 to your computer and use it in GitHub Desktop.
Save korakot/ad3e594db148c88a3e2b83618af07c76 to your computer and use it in GitHub Desktop.
Using a custom word list with the pythainlp Tokenizer class, and also with spaCy.
# from https://www.facebook.com/groups/thainlp/permalink/721765038204989/
import pythainlp
from pythainlp import Tokenizer
# Create a pythainlp Tokenizer backed by a custom word list.
#
# 2 ways to create it. The two lines below are alternatives: the second
# assignment rebinds `tokenizer`, so keep only the one you need.
tokenizer = Tokenizer(["word1", "word2"])  # from an in-memory list of words
tokenizer = Tokenizer("path/dict.txt")  # from a file path (presumably a placeholder -- point it at a real file)
# Two candidate sources for the word list follow. They are alternatives too:
# the second assignment to `words` replaces the first.
# Option A: the default word list
words = pythainlp.corpus.thai_words()
# Option B: the ttc word list
from pythainlp.corpus import ttc
# Each item from word_freqs() is unpacked as a pair and only the first element
# (the word) is kept. Presumably the second element is a frequency -- confirm.
words = [w for w,_ in ttc.word_freqs()]
# `words` can be edited here first, before the tokenizer is built from it
tok = Tokenizer(words)
!pip install pythainlp
import pythainlp
from pythainlp.corpus import ttc
# Create a custom tokenizer from the ttc word list.
# Each item from word_freqs() is unpacked as a pair; only the word is kept.
min_words = [w for w, _ in ttc.word_freqs()]  # can add more words to it
tok = pythainlp.Tokenizer(min_words)

# Use it with spaCy.
from spacy.lang.th import Thai
nlp = Thai()
# Replace the word_tokenize callable held by spaCy's Thai tokenizer with the
# custom tokenizer's method, so nlp(...) segments text using min_words.
nlp.tokenizer.word_tokenize = tok.word_tokenize
# Iterating a spaCy Doc yields Token objects, not strings; collect .text so
# the result is the list of plain strings shown in the expected output below.
[token.text for token in nlp('ฝนตกที่ทะเล')]
# ['ฝน', 'ตก', 'ที่', 'ทะเล'] because no 'ฝนตก' in min_words
# just use word_tokenize, but with custom dict
!pip install pythainlp -q
import pythainlp
from pythainlp.tokenize import word_tokenize, newmm
from pythainlp.corpus import ttc
# Build the custom dictionary from the ttc word list. Each item from
# word_freqs() is unpacked as a pair; only the word is kept.
words = [w for w, _ in ttc.word_freqs()]
trie = pythainlp.util.Trie(words)
# Hand the trie to word_tokenize through its `custom_dict` argument instead of
# overwriting newmm.DEFAULT_WORD_DICT_TRIE. That global is an internal detail
# of the newmm module: patching it changes the dictionary for every other
# caller in the process and depends on how newmm happens to look it up.
# Pass `custom_dict=trie` on each call that should use the custom dictionary.
word_tokenize('ฝนตกทั่วฟ้า', custom_dict=trie)  # ['ฝน', 'ตก', 'ทั่ว', 'ฟ้า']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment