Skip to content

Instantly share code, notes, and snippets.

@chappyhome
Created October 19, 2013 13:05
Show Gist options
  • Save chappyhome/7055514 to your computer and use it in GitHub Desktop.
Save chappyhome/7055514 to your computer and use it in GitHub Desktop.
MMseg中文分词 — Chinese Word Segmentation with the MMSeg Algorithm
#coding:utf-8
#!/usr/bin/env python
import xapian
import sys
import string
from collections import defaultdict
from mmseg.search import seg_txt_search,seg_txt_2_dict
import xapian
# Filesystem location of the Xapian database used by this script.
DBPATH = '/tmp/db/'
# Writable handle on the database at DBPATH; the DB_CREATE_OR_OPEN flag
# creates the database if it does not exist yet, otherwise opens it.
SEARCH_DB = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)
# Enquire object bound to SEARCH_DB.
# NOTE(review): index_txt() and flush_db() do not reference this object.
SEARCH_ENQUIRE = xapian.Enquire(SEARCH_DB)
def index_txt(id, txt):
    """Index ``txt`` in SEARCH_DB as the document identified by ``id``.

    The text is segmented with ``seg_txt_2_dict`` and every (word, value)
    pair it yields is added to a fresh Xapian document as a term.  The
    document is then stored under the unique term ``":<id>"``, so calling
    this again with the same ``id`` replaces the earlier document rather
    than adding a duplicate.

    Args:
        id: Identifier of the document; formatted with ``%s`` into the
            unique key term.  (The name shadows the builtin but is kept
            because it is part of the public signature.)
        txt: Text to segment and index.

    Note:
        Nothing is persisted until the database is flushed (see
        ``flush_db``).  No data payload is attached to the document
        (``set_data`` is never called here).
    """
    doc = xapian.Document()
    # ``.items()`` rather than the Python-2-only ``.iteritems()``: same
    # pairs on Python 2, and it does not break on Python 3.
    # presumably ``value`` is the in-text frequency of ``word`` -- it is
    # passed as add_term's second argument; confirm against mmseg.
    for word, value in seg_txt_2_dict(txt).items():
        doc.add_term(word, value)
    # The key term is both added to the document and used as the unique
    # term for replace_document, which is what makes re-indexing idempotent.
    key = ":%s" % id
    doc.add_term(key)
    SEARCH_DB.replace_document(key, doc)
def flush_db():
    """Flush pending changes of the module-level SEARCH_DB."""
    SEARCH_DB.flush()
if __name__ == "__main__":
    # Demo run: index one sample paragraph as document 1, then flush the
    # database.  The literal's content lines stay at column 0 so the
    # indexed text is exactly the original string.
    txt = """
治安署地最高长官站在街头,皱眉看着一队近卫军飞快地走过,他心中满是疑惑,立刻回到了治安署里地办公室,然后喊来了自己地一个部下,让他立刻去军方统帅部请示一下.
"""
    index_txt(1, txt)
    flush_db()
#coding:utf-8
from mmseg.search import seg_txt_search,seg_txt_2_dict
import xapian
# Filesystem location of the Xapian database to search.
DBPATH = '/tmp/db/'
# Writable handle on the database at DBPATH; the DB_CREATE_OR_OPEN flag
# creates the database if it does not exist yet, otherwise opens it.
# NOTE(review): the code in this script only queries the database; confirm
# whether a read-only xapian.Database handle would suffice here.
SEARCH_DB = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)
# Enquire object bound to SEARCH_DB; used as the default ``enquire``
# argument of search().
SEARCH_ENQUIRE = xapian.Enquire(SEARCH_DB)
def search(keywords, offset=0, limit=35, enquire=SEARCH_ENQUIRE):
    """Run a segmented AND-query for ``keywords`` and return the match set.

    ``keywords`` is segmented with ``seg_txt_2_dict``; each (word, value)
    pair becomes one ``xapian.Query(word, value)``.  A single term is used
    as the query directly, otherwise the term queries are combined with
    ``OP_AND``.

    Args:
        keywords: Text to segment into query terms.
        offset: Index of the first match to return.
        limit: Maximum number of matches to return.
        enquire: Enquire object to run the query on; defaults to the
            module-level SEARCH_ENQUIRE (bound when the function is
            defined).

    Returns:
        The match set returned by ``enquire.get_mset(offset, limit, None)``.
    """
    # ``.items()`` rather than the Python-2-only ``.iteritems()``: same
    # pairs on Python 2, and it does not break on Python 3.
    # presumably ``value`` is the frequency of ``word`` in ``keywords`` --
    # it is passed as the Query's second argument; confirm against mmseg.
    query_list = [
        xapian.Query(word, value)
        for word, value in seg_txt_2_dict(keywords).items()
    ]
    if len(query_list) == 1:
        query = query_list[0]
    else:
        # Also taken when no term was produced (empty list), exactly as
        # in the original ``!= 1`` test.
        query = xapian.Query(xapian.Query.OP_AND, query_list)
    enquire.set_query(query)
    return enquire.get_mset(offset, limit, None)
if __name__ == "__main__":
    # Demo run: search for a sample keyword and list the matches.
    matches = search( "治安")
    # Display the results.
    # Each print takes one parenthesised argument, so the output is
    # identical under the Python 2 print statement and the Python 3
    # print function.
    print("%i results found." % matches.get_matches_estimated())
    print("Results 1-%i:" % matches.size())
    for m in matches:
        print("%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment