Skip to content

Instantly share code, notes, and snippets.

@xingkaixin
Created September 6, 2014 06:30
Show Gist options
  • Save xingkaixin/77c69d0be06e1dac51d9 to your computer and use it in GitHub Desktop.
Save xingkaixin/77c69d0be06e1dac51d9 to your computer and use it in GitHub Desktop.
Chinese word segmentation utilities
# -*- coding:utf-8 -*-
import jieba
import sys
# Python 2-only workaround: reload the sys module, then set the interpreter's
# default string encoding to UTF-8.
# NOTE(review): presumably needed so implicit str/unicode conversions of the
# Chinese text further down do not fail -- confirm before removing.
reload(sys)
sys.setdefaultencoding( "utf-8" )
def trim(mystr):
    """Return ``mystr`` with every space character (' ') removed.

    Only the plain space character is dropped; tabs, newlines and any other
    whitespace are kept as they are.

    Args:
        mystr: The string to process. It is walked item by item, so any
            iterable of strings is accepted.

    Returns:
        The remaining items concatenated in their original order. An empty
        input yields an empty string.
    """
    # A single join avoids the repeated string concatenation of a manual
    # accumulation loop and does not shadow the builtin ``str``.
    return ''.join(ch for ch in mystr if ch != ' ')
# Sample input mixing English words, Chinese text and numeric tokens.
text = '''
Dear Kevin,
请提供一下科目的清单 223023 translation date 2013011
'''

# Segment with jieba's default mode. The alternatives, full mode
# (jieba.cut(text, cut_all=True)) and search-engine mode
# (jieba.cut_for_search(text)), are not used here.
default_mode = jieba.cut(text)

# Remove spaces, newlines and tabs from every token, then keep only the
# tokens that still contain at least one character.
cleaned = (trim(token).replace('\n', '').replace('\t', '') for token in default_mode)
words = [word for word in cleaned if word]

# First show the whole token list, then each token on its own line.
print(words)
for word in words:
    print(word.decode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment