Created
April 2, 2013 20:20
-
-
Save diiq/5295824 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Builds a character/word-indexed tree of files.""" | |
import itertools | |
import glob | |
punctuation = ".,;:/?<>{}[]()-_=+!@#$%^&*\/\\~*" # TODO be more systematic | |
def words_in_file(handle, max_len): | |
"""Returns an alphabetized list of unique words in the file. | |
handle is a file handle with read access | |
max_len is an int, the max length of word we're interested in indexing. | |
""" | |
lines = handle.readlines() | |
words = itertools.chain(*[line.rstrip().rsplit(" ") for line in lines]) | |
words = (word.rstrip(punctuation).lstrip(punctuation)[:max_len].upper() for word in words) | |
return sorted(list(set(words))) | |
# OK, now what I'm after is adding a file to the index in all appropriate places. | |
# I'm going to start by being dumb, and just have the whole index floating in RAM on this end. | |
class IndexEntry(object): | |
def __init__(self, word): | |
self.word = word | |
# index is a dictionary of characters to IndexEntries | |
self.index = {} | |
# files is a list of filenames which contain the word indexed by IndexEntry. | |
self.files = [] | |
def add_file_word(self, filename, word): | |
index = self | |
if word == "": | |
return | |
for i, letter in enumerate(word): | |
if not letter in index.index: | |
index.index[letter] = IndexEntry(word) | |
index = index.index[letter] | |
index.files.append(filename); | |
def __str__(self, depth=0): | |
rstr = " " * depth * 2 + self.word + " " + str(len(self.files)) | |
rstr = rstr + "\n" + "\n".join([v.__str__(depth + 1) for v in self.index.itervalues()]) | |
return rstr | |
def write_list(self, list_handle): | |
self.start_loc = list_handle.tell() | |
for filename in self.files: | |
list_handle.write("%s %s\n" % (self.word, filename)) | |
for entry in self.index.itervalues(): | |
entry.write_list(list_handle) | |
self.end_loc = list_handle.tell() | |
def write_index(self, index_handle): | |
# How much space will this particular index take up? | |
# TODO magic nums | |
index_size = 18 * len(self.index.values()) + 34 | |
index_loc = index_handle.tell() | |
# reserve space for this index | |
index_handle.write(" " * index_size) | |
addresses = {} | |
for char, entry in self.index.iteritems(): | |
addresses[char] = index_handle.tell() | |
entry.write_index() | |
# now write the index to the reserved space | |
tmp = index_handle.tell() | |
index_handle.seek(index_loc, 0) | |
# TODO this should be a byte-value, not a string. Idiot. | |
index_handle.write(" ".join(["%s%16d" % pair for pair in address.iteritems()])) | |
index_handle.write(" %32d %32d\n" % self.start_loc, self.end_loc) | |
def build_index(filenames, depth): | |
index = IndexEntry("") | |
for filename in filenames: | |
handle = open(filename); | |
for word in words_in_file(handle, depth): | |
index.add_file_word(filename, word) | |
return index | |
def list_files(root, depth): | |
# Silly and HACK &c. | |
return glob.glob(root + ("/*" * depth)) | |
if __name__ == '__main__': | |
files = list_files("/home/diiq/Downloads/enron_mail_20110402/maildir", 3) | |
ind = build_index(files[:100], 3); | |
outfile = "foo.lst" | |
indfile = "foo.ind" | |
ind.write_list(open(outfile, 'w'), None) | |
ind.write_(open(outfile, 'w'), None) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TOTALLY a sketch. Incomplete, untested, and DOES NOT currently match output format of build_index. | |
# a sketch ONLY. | |
"""Searches a character/word indexed tree of files for a given string.""" | |
# File of indices (smaller) | |
# header_length charloc charloc... loc | |
# File of documents | |
# word filename\n... | |
# OK, in truth, you'd want multiple file pointers, and some stuff, but it's a place to start. | |
# TODO: print the words for the results. | |
def load_index(handle): | |
header_len = handle.read(1) | |
header = handle.read(header_len) | |
index = {} | |
for pair in index.rsplit(): | |
index[pair[0]] = int(pair[1:]) | |
return index | |
def next_index(index, letter, handle): | |
if not letter in index: | |
return False | |
handle.seek(index[letter].location) | |
return load_index(handle) | |
# Doesn't reset handle's location. | |
def lookup_partial(word, handle): | |
index = load_index(handle) | |
for letter in word: | |
index = next_index(loc, letter) | |
if not index: | |
return False | |
return index | |
def list_results(index, handle): | |
# First pick out the results that match exactly | |
# Bang for word as-is | |
handle.seek(index["!"]) | |
handle.readline(1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment