Created
April 21, 2016 22:33
-
-
Save w32zhong/3ebc634461e64d8c783fba9e0bb33540 to your computer and use it in GitHub Desktop.
Get search-term tokens together with their positions and part-of-speech tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding=utf-8 | |
from jieba.posseg import POSTokenizer | |
import sys | |
import jieba | |
# Sample sentence fed to the demo run at the end of this file; it mixes
# Chinese text with an ASCII word and digits.
utf8_str = "其实,工信处女干事microsoft每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"

# Module-wide POSTokenizer instance; stays None until cjieba_init() assigns it.
posseg_tk = None
def cjieba_init():
    """Create the module-wide part-of-speech tokenizer.

    Wraps jieba's default tokenizer (``jieba.dt``) in a ``POSTokenizer`` and
    stores the result in the global ``posseg_tk``. Calling this again
    replaces the previously stored instance.
    """
    global posseg_tk
    tagger = POSTokenizer(jieba.dt)
    posseg_tk = tagger
def cjieba_tokenize(utf8_txt):
    """Yield search-mode tokens of *utf8_txt* with offsets and a POS tag.

    Each yielded item is a ``(word, start, end, tag)`` tuple. The first three
    fields come straight from ``jieba.dt.tokenize(utf8_txt, mode="search")``;
    ``tag`` is looked up in the POS tokenizer's ``word_tag_tab``. A word that
    is missing from that table (or is tagged ``'x'`` there) is re-tagged with
    the tag of the first piece produced by the tokenizer's detail cutter; if
    the cutter produces nothing, the tag stays ``'x'``.

    The shared tokenizer is created on first use if ``cjieba_init()`` has not
    been called yet.
    """
    if posseg_tk is None:
        # Build the tagger lazily rather than failing with an opaque
        # AttributeError on None when the caller skipped cjieba_init().
        cjieba_init()
    tagger = posseg_tk
    tag_table = tagger.word_tag_tab

    for word, start, end in jieba.dt.tokenize(utf8_txt, mode="search"):
        tag = tag_table.get(word, 'x')
        if tag == 'x':
            # __cut_detail is a private (name-mangled) POSTokenizer method,
            # so this line depends on jieba internals.
            pieces = tagger._POSTokenizer__cut_detail(word)
            # Pass a default to next(): a bare StopIteration escaping inside
            # a generator becomes RuntimeError (PEP 479) or, on older
            # interpreters, silently ends the token stream early.
            _, tag = next(pieces, (word, 'x'))
        yield (word, start, end, tag)
def cjieba_print(gen):
    """Print one tab-separated line per ``(word, start, end, tag)`` item of *gen*."""
    line_fmt = "%s\t start: %d \t end:%d \t tag:%s"
    for token in gen:
        word, start, end, tag = token
        print(line_fmt % (word, start, end, tag))
# Demo: build the tagger once, then tokenize and print the sample sentence
# twice, each pass preceded by its banner line.
cjieba_init()
for banner in ("=======init finish=======", "=======2nd======="):
    print(banner)
    r = cjieba_tokenize(utf8_str)
    cjieba_print(r)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment