@w32zhong
Created April 21, 2016 22:33
Get search-term tokens with position and POS-tag info, by combining jieba's search-mode tokenize() with the posseg word-tag table.
#encoding=utf-8
from jieba.posseg import POSTokenizer
import jieba

utf8_str = "其实,工信处女干事microsoft每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"

posseg_tk = None

def cjieba_init():
    # Build a POS tokenizer that shares the default tokenizer's dictionary.
    global posseg_tk
    posseg_tk = POSTokenizer(jieba.dt)

def cjieba_tokenize(utf8_txt):
    # Search-mode tokenize yields (word, start, end) tuples without tags.
    result = jieba.dt.tokenize(utf8_txt, mode="search")
    for tk in result:
        # Look up the POS tag in the word-tag table; fall back to the
        # detailed cut (name-mangled private method) for unknown words.
        tag = posseg_tk.word_tag_tab.get(tk[0], 'x')
        if tag == 'x':
            tmp = posseg_tk._POSTokenizer__cut_detail(tk[0])
            _, tag = next(tmp)
        yield (tk[0], tk[1], tk[2], tag)

def cjieba_print(gen):
    for word, start, end, tag in gen:
        print("%s\t start: %d \t end:%d \t tag:%s" % (word, start, end, tag))

cjieba_init()
print("=======init finish=======")
r = cjieba_tokenize(utf8_str)
cjieba_print(r)
print("=======2nd=======")
r = cjieba_tokenize(utf8_str)
cjieba_print(r)
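
For comparison, jieba's documented posseg API returns the same tags directly but without character offsets, which is why the snippet above pairs tokenize() with the word-tag table instead. A minimal sketch of the plain posseg path:

#encoding=utf-8
import jieba.posseg as pseg

# pseg.cut yields pair objects exposing .word and .flag,
# but no start/end positions for each token.
for w in pseg.cut("其实,工信处女干事microsoft每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"):
    print("%s\t tag:%s" % (w.word, w.flag))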