Created
October 19, 2013 13:05
-
-
Save chappyhome/7055514 to your computer and use it in GitHub Desktop.
MMseg中文分词: Chinese word segmentation using the MMSeg algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8
import xapian | |
import sys | |
import string | |
from collections import defaultdict | |
from mmseg.search import seg_txt_search,seg_txt_2_dict | |
import xapian | |
# Directory that holds the on-disk Xapian index.
DBPATH = '/tmp/db/'

# Opened at import time; DB_CREATE_OR_OPEN creates the index when it does
# not exist yet and opens it otherwise.
SEARCH_DB = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Enquire object bound to SEARCH_DB.
SEARCH_ENQUIRE = xapian.Enquire(SEARCH_DB)
def index_txt(id, txt):
    """Index ``txt`` in ``SEARCH_DB`` as the document identified by ``id``.

    The text is split with ``seg_txt_2_dict`` and every ``(word, value)``
    pair it yields is added as a term, with ``value`` passed as the term's
    within-document-frequency increment (presumably an occurrence count --
    confirm against mmseg).  A synthetic term ``":<id>"`` is added as well
    and used as the unique key for ``replace_document``, so indexing the
    same ``id`` again replaces the earlier document instead of adding a
    second one.

    Args:
        id: Caller-chosen document identifier; formatted with ``%s``.
        txt: Text to segment and index.  It is also stored as the
            document data so that search hits can display it.
    """
    doc = xapian.Document()
    # Keep the source text with the document; without this, get_data() on
    # a search hit returns an empty string.
    doc.set_data(txt)
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for word, value in seg_txt_2_dict(txt).items():
        doc.add_term(word, value)
    key = ":%s" % id
    doc.add_term(key)
    SEARCH_DB.replace_document(key, doc)
def flush_db():
    """Call ``flush()`` on the module-level ``SEARCH_DB``.

    NOTE(review): newer Xapian releases expose this operation as
    ``commit()`` -- confirm which name the deployed bindings expect.
    """
    SEARCH_DB.flush()
if __name__ == "__main__":
    # Demo run: index one sample passage as document 1, then flush it.
    sample_text = """
治安署地最高长官站在街头,皱眉看着一队近卫军飞快地走过,他心中满是疑惑,立刻回到了治安署里地办公室,然后喊来了自己地一个部下,让他立刻去军方统帅部请示一下.
"""
    index_txt(1, sample_text)
    flush_db()
#coding:utf-8 | |
from mmseg.search import seg_txt_search,seg_txt_2_dict | |
import xapian | |
# Directory that holds the on-disk Xapian index.
DBPATH = '/tmp/db/'

# Opened at import time; DB_CREATE_OR_OPEN creates the index when missing.
# NOTE(review): the code below only reads from the index; a read-only
# xapian.Database might be enough here -- confirm nothing relies on the
# create-on-open behaviour before changing it.
SEARCH_DB = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Default Enquire used by search().
SEARCH_ENQUIRE = xapian.Enquire(SEARCH_DB)
def search(keywords, offset=0, limit=35, enquire=SEARCH_ENQUIRE):
    """Run a query built from ``keywords`` and return the matches.

    ``keywords`` is segmented with ``seg_txt_2_dict``; each ``(word, value)``
    pair becomes a ``xapian.Query`` term, with ``value`` passed as the
    second ``Query`` argument.  A single term is used as the query
    directly; any other number of terms is combined with ``OP_AND``.

    Args:
        keywords: Text to segment into query terms.
        offset: First argument to ``get_mset`` (index of the first hit).
        limit: Second argument to ``get_mset`` (maximum number of hits).
        enquire: Enquire object to run the query on; defaults to the
            module-level ``SEARCH_ENQUIRE``.

    Returns:
        Whatever ``enquire.get_mset(offset, limit, None)`` returns.
    """
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    query_list = [
        xapian.Query(word, value)
        for word, value in seg_txt_2_dict(keywords).items()
    ]
    if len(query_list) == 1:
        query = query_list[0]
    else:
        # Also taken when segmentation yields no terms, as before.
        query = xapian.Query(xapian.Query.OP_AND, query_list)
    enquire.set_query(query)
    return enquire.get_mset(offset, limit, None)
if __name__ == "__main__":
    # Demo run: search for a sample keyword and list the hits.
    matches = search("治安")

    # Display the results.  Each print takes one parenthesised expression,
    # so the output is identical on Python 2 and it also runs on Python 3.
    print("%i results found." % matches.get_matches_estimated())
    print("Results 1-%i:" % matches.size())
    for m in matches:
        print("%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment