Skip to content

Instantly share code, notes, and snippets.

@jogonba2
Created April 12, 2015 14:46
Show Gist options
  • Save jogonba2/ed853630645654aaf745 to your computer and use it in GitHub Desktop.
Save jogonba2/ed853630645654aaf745 to your computer and use it in GitHub Desktop.
Single-pass in-memory indexing -> SPIMI algorithm
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Overxfl0w13 #
# Single-pass in-memory indexing -> SPIMI algorithm #
from string import ascii_uppercase,ascii_lowercase,digits
from random import choice
try: from cPickle import HIGHEST_PROTOCOL,dump,load
except: from pickle import HIGHEST_PROTOCOL,dump,load
from sys import argv
def spimi_invert(token_stream):
    """Invert one block of tokens and write the resulting index to disk.

    Builds an in-memory dictionary that maps every term to its postings list
    (document ids without duplicates, most recently seen first), then pickles
    two objects, in this order, into a new file named with 10 random
    alphanumeric characters plus ``.bin`` in the current working directory:

    1. the list of terms sorted in descending order;
    2. the term -> postings-list dictionary.

    :param token_stream: iterable of ``(term, docid)`` pairs; only indexes
        0 and 1 of each item are read.
    :return: ``None``. The result is the block file written to disk.
    """
    # Build the whole index first so that a failure while consuming the
    # token stream does not leave an empty block file behind.
    dictionary = {}
    for token in token_stream:
        term, docid = token[0], token[1]
        if term not in dictionary:
            postings_list = add_to_dictionary(dictionary, term)
        else:
            postings_list = get_postings_list(dictionary, term)
        add_to_postings_list(postings_list, docid)

    # Sort on the complete term. The previous key (``term[0]``) compared only
    # the first character, so terms sharing an initial were left in arbitrary
    # dictionary order, and an empty-string term raised IndexError.
    sorted_terms = sorted(dictionary, reverse=True)

    # range() behaves like xrange() for these 10 iterations and also exists
    # on Python 3.
    alphabet = ascii_uppercase + ascii_lowercase + digits
    block_name = "".join(choice(alphabet) for _ in range(10)) + ".bin"

    # The with-statement closes the file; no explicit close() is needed.
    with open(block_name, "wb") as output_file:
        save_object(sorted_terms, output_file)
        save_object(dictionary, output_file)
def add_to_dictionary(dictionary,term):
    """Register ``term`` with a new, empty postings list and return that list.

    Any postings list already stored under ``term`` is replaced.

    :param dictionary: mutable mapping of term -> postings list; modified
        in place.
    :param term: key to register.
    :return: the empty list now stored in ``dictionary[term]`` (the same
        object, so appending to it updates the dictionary).
    """
    postings_list = []
    dictionary[term] = postings_list
    return postings_list
def get_postings_list(dictionary,term):
    """Return the postings list stored under ``term``.

    The list object held by ``dictionary`` is returned directly (no copy).
    Looking up a term that is not present propagates the mapping's own
    lookup error.
    """
    postings_list = dictionary[term]
    return postings_list
def add_to_postings_list(postings_list,docid):
    """Put ``docid`` at the front of ``postings_list`` unless already present.

    The list is modified in place and nothing is returned.
    """
    if docid in postings_list:
        return
    postings_list.insert(0, docid)
# Persistence #
def save_object(object,fd):
    """Pickle ``object`` onto ``fd`` using the highest available protocol.

    ``fd`` must be a file-like object opened for binary writing. Successive
    calls on the same ``fd`` append one pickle after another.
    """
    # ``object`` shadows the builtin, but it is part of the public signature
    # and is therefore kept.
    protocol = HIGHEST_PROTOCOL
    dump(object, fd, protocol)
def load_object(source):
    """Load the first pickled object stored in the file at ``source``.

    Only a single ``load`` is issued. For a block file produced by
    ``spimi_invert`` (which dumps the sorted term list and then the
    dictionary) this therefore returns the sorted term list only.

    :param source: path of the file to read.
    :return: the first object unpickled from the file.
    """
    # NOTE(review): unpickling can execute arbitrary code; only call this on
    # files from a trusted origin.
    # The with-statement closes the file, so no explicit close() is needed.
    with open(source, 'rb') as fd:
        return load(fd)
# Simplified corpus process #
def spimi_corpus_process(path_corpus,file_names,block_size):
    """Tokenize a plain-text corpus and index it in blocks of tokens.

    Every file is read through NLTK's ``PlaintextCorpusReader`` (decoded as
    latin-1), all ``(word, docid)`` pairs are collected in memory, and the
    pairs are then handed to ``spimi_invert`` in chunks of at most
    ``block_size`` tokens, each chunk producing one block file. Chunks are
    taken from the end of the token list, last token first.

    :param path_corpus: corpus root directory passed to the corpus reader.
    :param file_names: file ids passed to the corpus reader.
    :param block_size: maximum number of tokens per block; must be > 0.
    :raises ValueError: if ``block_size`` is not positive (the previous
        implementation looped forever in that case).
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    # Imported here, as in the original, so the module can be imported
    # without NLTK installed.
    from nltk.corpus import PlaintextCorpusReader
    wordlists = PlaintextCorpusReader(path_corpus, file_names, encoding='latin-1')

    block = []
    for fileid in wordlists.fileids():
        # The docid is the single character that precedes the last "." of
        # the file id (e.g. "doc3.txt" -> "3").
        # NOTE(review): files whose stems end in the same character share a
        # docid - confirm this simplification is intended.
        docid = fileid[:fileid.rfind(".")][-1:]
        block += [(word, docid) for word in wordlists.words(fileid)]

    while block:
        # Take up to block_size tokens from the tail, last token first (the
        # order repeated pop() calls produced). Slicing also yields the
        # final, shorter chunk; previously pop() raised IndexError on it and
        # those tokens were silently dropped without being indexed.
        chunk = block[-block_size:]
        del block[-block_size:]
        chunk.reverse()
        spimi_invert(chunk)
if __name__ == '__main__':
    # Expected argv: script name, corpus_path, block_size, then at least one
    # file name.
    if len(argv) < 4:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is also valid syntax on Python 3.
        print("Usage spimi.py [corpus_path] [block_size] [file1,...,fileN] ")
        # Same effect as the site-provided exit() (status 0) without
        # depending on the site module.
        raise SystemExit
    spimi_corpus_process(argv[1], argv[3:], int(argv[2]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment