Created
April 12, 2015 14:46
-
-
Save jogonba2/ed853630645654aaf745 to your computer and use it in GitHub Desktop.
One step memory indexation -> SPIMI algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Author: Overxfl0w13 # | |
# Single-pass in-memory indexing -> SPIMI algorithm #
from string import ascii_uppercase,ascii_lowercase,digits | |
from random import choice | |
try: from cPickle import HIGHEST_PROTOCOL,dump,load | |
except: from pickle import HIGHEST_PROTOCOL,dump,load | |
from sys import argv | |
def spimi_invert(token_stream):
    """Invert one block of tokens and write it to a randomly named ``.bin`` file.

    Builds an in-memory dictionary mapping each term to its postings list,
    then pickles two objects back to back into the output file: the list of
    terms in descending order, followed by the dictionary itself.

    Args:
        token_stream: iterable of indexable tokens where ``token[0]`` is the
            term and ``token[1]`` is the document id.

    Returns:
        None. The block is written to a file named with 10 random
        alphanumeric characters plus ``.bin``, opened relative to the
        current working directory.
    """
    # Build the whole block in memory first, so that a failure while
    # processing tokens does not leave an empty block file behind.
    dictionary = {}
    for token in token_stream:
        term, docid = token[0], token[1]
        if term not in dictionary:
            postings_list = add_to_dictionary(dictionary, term)
        else:
            postings_list = get_postings_list(dictionary, term)
        add_to_postings_list(postings_list, docid)

    # Iterating a dict yields the term strings themselves, so they are sorted
    # on the full term. Keying on ``term[0]`` would order by first character
    # only and would fail on an empty term. Descending order is kept as-is.
    sorted_terms = sorted(dictionary, reverse=True)

    alphabet = ascii_uppercase + ascii_lowercase + digits
    # ``range`` (not ``xrange``) keeps this runnable on Python 2 and 3.
    file_name = "".join(choice(alphabet) for _ in range(10)) + ".bin"

    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(file_name, "wb") as output_file:
        save_object(sorted_terms, output_file)
        save_object(dictionary, output_file)
def add_to_dictionary(dictionary, term):
    """Store a fresh, empty postings list under ``term`` and return that list.

    Any value already stored under ``term`` is replaced.
    """
    postings = []
    dictionary[term] = postings
    return postings
def get_postings_list(dictionary, term):
    """Return the postings list stored under ``term`` in ``dictionary``.

    The lookup is a plain subscript, so a missing ``term`` in a ``dict``
    raises ``KeyError``.
    """
    postings = dictionary[term]
    return postings
def add_to_postings_list(postings_list, docid):
    """Put ``docid`` at the front of ``postings_list`` unless already present.

    The list is modified in place and nothing is returned.
    """
    if docid in postings_list:
        return
    postings_list.insert(0, docid)
# Persistence # | |
def save_object(object, fd):
    """Pickle ``object`` into the open file ``fd`` with ``HIGHEST_PROTOCOL``.

    Nothing is returned; ``fd`` is left open for the caller.
    """
    # The parameter name shadows the builtin ``object``; it is kept so that
    # existing keyword callers continue to work.
    dump(object, fd, HIGHEST_PROTOCOL)
def load_object(source):
    """Unpickle and return the first object stored in the file at ``source``.

    Only one ``load`` call is made, so anything pickled after the first
    object in the file is not read.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files from a trusted origin.
    """
    # The ``with`` statement closes the file once the object has been read.
    with open(source, 'rb') as pickled_file:
        return load(pickled_file)
# Simplified corpus process # | |
def spimi_corpus_process(path_corpus, file_names, block_size):
    """Tokenize a plain-text corpus and invert it in blocks of ``block_size``.

    Every word of every file is turned into a ``(word, docid)`` token, and
    the tokens are handed to ``spimi_invert`` in chunks of at most
    ``block_size`` tokens, each chunk producing one block file.

    Args:
        path_corpus: corpus root passed to ``PlaintextCorpusReader``.
        file_names: file ids passed to ``PlaintextCorpusReader``.
        block_size: maximum number of tokens per block; must be >= 1.

    Raises:
        ValueError: if ``block_size`` is smaller than 1. Such a value would
            never shrink the token list, so the loop would not terminate.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    from nltk.corpus import PlaintextCorpusReader
    wordlists = PlaintextCorpusReader(path_corpus, file_names, encoding='latin-1')

    block = []
    for fileid in wordlists.fileids():
        # docid is the single character just before the last "." in the
        # file id.
        # NOTE(review): file ids ending in the same character (e.g. "doc1"
        # and "doc11") get the same docid; confirm the corpus naming scheme.
        docid = fileid[:fileid.rfind(".")][-1:]
        block.extend((word, docid) for word in wordlists.words(fileid))

    # Consume the token list from its tail, newest token first, which is the
    # order repeated ``block.pop()`` calls produce. Slicing also yields the
    # final short chunk. Popping ``block_size`` items one by one raised
    # IndexError on that chunk, and swallowing the error dropped its tokens.
    while block:
        chunk = block[-block_size:]
        del block[-block_size:]
        chunk.reverse()
        spimi_invert(chunk)
# Command-line entry point: spimi.py [corpus_path] [block_size] [file1 ... fileN]
if __name__ == '__main__':
    if len(argv) < 4:
        # The call form of print with a single string behaves the same on
        # Python 2 and 3. The statement form is a SyntaxError on Python 3.
        print("Usage spimi.py [corpus_path] [block_size] [file1,...,fileN] ")
        # Same effect as exit() with no argument, without relying on the
        # helper that the ``site`` module injects.
        raise SystemExit
    # argv[1] is the corpus path, argv[2] the block size, the rest are files.
    spimi_corpus_process(argv[1], argv[3:], int(argv[2]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment