This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ตาม guru.sanook.com/1520 | |
import re | |
# Character translation tables built once at import time for use with
# ``str.translate``.  Characters that do not appear in a table's source
# string are left unchanged by ``str.translate``.
#
# NOTE(review): these tables are presumably consumed by ``LK82`` defined
# right after them; its body is not visible here, so confirm against it.

# t1: maps each of 46 Thai consonants to a representative consonant, so that
# several consonants collapse onto the same letter (e.g. ข, ฃ, ค, ฅ, ฆ -> ก).
t1 = str.maketrans(
    "กขฃคฅฆงจฉชฌซศษสญยฎดฏตณนฐฑฒถทธบปผพภฝฟมรลฬฤฦวหฮอ",
    "กกกกกกงจชชชซซซซยยดดตตนนททททททบปพพพฟฟมรรรรรวหหอ",
)

# t2: maps 58 Thai consonants and vowel signs to a single code character
# drawn from "1"-"9" and "A"-"F"; characters sharing a code are treated as
# the same class (e.g. ก, ข, ฃ, ค, ฅ, ฆ -> "1").
t2 = str.maketrans(
    "กขฃคฅฆงจฉชซฌฎฏฐฑฒดตถทธศษสญณนรลฬฤฦบปพฟภผฝมำยวไใหฮาๅึืเแโุูอ",
    "1111112333333333333333333444444445555555667777889AAABCDEEF",
)
def LK82(s): | |
res = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def listcut(text): | |
''' | |
ใช้หาคำที่สามารถตัดได้ทั้งหมดโดยจะเป็น list โดยมี | เป็นตัวแบ่งใน list | |
''' | |
listdata = list(tcut(text)) | |
listdata1=['']*len(listdata) | |
i=0 | |
maxnum=0 | |
numall=1 | |
listnum=[0]*len(listdata) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from bottle import route, run, template,get, post, request | |
import csv | |
import codecs | |
@route('/') | |
def home(): | |
return ''' | |
<!DOCTYPE html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pythainlp.tokenize import word_tokenize # ใช้ในการตัดคำ | |
from pythainlp.tag import pos_tag | |
from nltk import RegexpParser | |
chunker = RegexpParser(""" | |
Bank: | |
{<NCMN>+} | |
""") | |
text='ธนาคารบัวเทพร่วมมือกันแก้ไขปัญหาทางการเงินของไทย' | |
a=word_tokenize(text,engine='mm') | |
b=pos_tag(a,engine='artagger') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
def arrUnicode(myArr): | |
uniStr = [unicode(i, encoding='UTF-8') if isinstance(i, basestring) else i for i in myArr] | |
s = repr(uniStr).decode('unicode_escape').encode('utf-8') | |
if s.startswith("[u'"): | |
s2 = s.replace("u'", "'") | |
elif s.startswith('[u"'): | |
s2 = s.replace('u"', '"') | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import nlpnet | |
>>> path="ที่ตั้งโฟลเดอร์ที่เก็บไฟล์train" | |
>>> nlpnet.set_data_dir(path) | |
>>> tagger = nlpnet.POSTagger() | |
# โหลดฟังก์ชัน arrUnicode แสดงผลภาษาไทยใน Python 2.7 ได้จาก https://gist.github.com/wannaphongcom/266e340b80b0b21e11f4f768965fe8b0 | |
>>> print arrUnicode(tagger.tag('คุณ เป็น ครู ภาษาไทย ใช่ไหม')) | |
[[(u'คุณ', u'JSBR'), (u'เป็น', u'VSTA'), (u'ครู', u'NCMN'), (u'ภาษาไทย', u'VATT'), (u'ใช่ไหม', u'NCMN')]] | |
>>> print arrUnicode(tagger.tag('คุณ เป็น คุณครู ภาษาไทย ใช่ไหม')) | |
[[(u'คุณ', u'JSBR'), (u'เป็น', u'VSTA'), (u'คุณครู', u'NCMN'), (u'ภาษาไทย', u'NCMN'), (u'ใช่ไหม', u'NCMN')]] | |
>>> print arrUnicode(tagger.tag('สวัสดี ครับ ทุกคน สบายดี กัน ไหม ครับ ตอนนี้ ผม ง่วง นอน มาก เลย แล ค ล่ะ')) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
โค้ดทดสอบก่อนถูกนำไปรวมกับ PyThaiNLP | |
ใช้ Apache License 2.0 | |
เขียนโดย นาย วรรณพงษ์ ภัททิยไพบูลย์ | |
""" | |
import re | |
""" | |
หลักการทำงาน | |
----------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
โค้ดทดสอบก่อนถูกนำไปรวมกับ PyThaiNLP | |
ใช้ Apache License 2.0 | |
เขียนโดย นาย วรรณพงษ์ ภัททิยไพบูลย์ | |
""" | |
import re | |
""" | |
หลักการทำงาน | |
----------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""Print most frequent N-grams in given file. | |
Usage: python ngrams.py filename | |
Problem description: Build a tool which receives a corpus of text, | |
analyses it and reports the top 10 most frequent bigrams, trigrams, | |
four-grams (i.e. most frequently occurring two, three and four word | |
consecutive combinations). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import spacy | |
>>> th_nlp = spacy.load('th') | |
>>> text="คุณรักผมไหม" | |
>>> a= th_nlp(text) | |
>>> a | |
คุณรักผมไหม | |
>>> list(a) | |
[คุณ, รัก, ผม, ไหม] |