Created
May 8, 2013 20:47
-
-
Save boblannon/5543531 to your computer and use it in GitHub Desktop.
Script demonstrating a basic pairwise comparison of all documents in a collection using python-superfastmatch.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from superfastmatch import client
import uuid

# Shared client used by the rest of the script; the URL points at a
# superfastmatch server on the local machine, port 9000.
sfm_client = client.Client(url='http://127.0.0.1:9000/')
class Document():
    """A titled piece of text plus the ids of its most similar documents.

    Attributes are plain and public:
      doc_id          -- random UUID (uuid4) generated at construction time
      title           -- the title string passed in
      content         -- the content string passed in
      near_neighbors  -- list that starts empty; the script appends the ids
                         of the best-matching other documents to it
    """

    def __init__(self, title_string, content_string):
        """Store the title and content and assign a fresh random id."""
        self.doc_id = uuid.uuid4()
        self.title = title_string
        self.content = content_string
        # A new list per instance, so neighbours are never shared.
        self.near_neighbors = []
# NOTE(review): doc_1, doc_2 and doc_3 are not defined anywhere in the visible
# script; presumably they are Document instances built by the reader -- confirm.
documents = [doc_1, doc_2, doc_3]

# Upload every document to the server under doctype 1.
for doc in documents:
    sfm_client.add(1, doc.doc_id, doc.content, title=doc.title)

# Fetch each document back and rank the other documents the server reports
# for it by the total of their fragment values.
for doc in documents:
    # SECURITY NOTE(review): eval() executes whatever text the server sends
    # back. If the response is JSON text, a real parser (json.loads) would be
    # the safe choice -- confirm what Client.get returns before changing this.
    sfm_doc = eval(sfm_client.get(1, doc.doc_id))
    comp_list = []
    try:
        for other_doc in sfm_doc['documents']['rows']:
            # frag[2] is presumably the length of the shared fragment -- TODO
            # confirm against the superfastmatch response format.
            shared = sum(frag[2] for frag in other_doc['fragments'])
            comp_list.append((other_doc['docid'], shared))
    except KeyError:
        # The response lacks an expected key: skip this document entirely,
        # leaving its near_neighbors untouched.
        continue
    # this takes the top ten, but you could also do something more clever
    comp_list = sorted(comp_list, key=lambda x: x[1], reverse=True)[0:10]
    # The slice above already caps the list at ten entries, so no further
    # bounds check is needed here.
    doc.near_neighbors.extend(other_doc_id for other_doc_id, _ in comp_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment