Skip to content

Instantly share code, notes, and snippets.

@diiq
Created April 2, 2013 20:20
Show Gist options
  • Save diiq/5295824 to your computer and use it in GitHub Desktop.
Save diiq/5295824 to your computer and use it in GitHub Desktop.
"""Builds a character/word-indexed tree of files."""
import itertools
import glob
punctuation = ".,;:/?<>{}[]()-_=+!@#$%^&*\/\\~*" # TODO be more systematic
def words_in_file(handle, max_len):
"""Returns an alphabetized list of unique words in the file.
handle is a file handle with read access
max_len is an int, the max length of word we're interested in indexing.
"""
lines = handle.readlines()
words = itertools.chain(*[line.rstrip().rsplit(" ") for line in lines])
words = (word.rstrip(punctuation).lstrip(punctuation)[:max_len].upper() for word in words)
return sorted(list(set(words)))
# OK, now what I'm after is adding a file to the index in all appropriate places.
# I'm going to start by being dumb, and just have the whole index floating in RAM on this end.
class IndexEntry(object):
def __init__(self, word):
self.word = word
# index is a dictionary of characters to IndexEntries
self.index = {}
# files is a list of filenames which contain the word indexed by IndexEntry.
self.files = []
def add_file_word(self, filename, word):
index = self
if word == "":
return
for i, letter in enumerate(word):
if not letter in index.index:
index.index[letter] = IndexEntry(word)
index = index.index[letter]
index.files.append(filename);
def __str__(self, depth=0):
rstr = " " * depth * 2 + self.word + " " + str(len(self.files))
rstr = rstr + "\n" + "\n".join([v.__str__(depth + 1) for v in self.index.itervalues()])
return rstr
def write_list(self, list_handle):
self.start_loc = list_handle.tell()
for filename in self.files:
list_handle.write("%s %s\n" % (self.word, filename))
for entry in self.index.itervalues():
entry.write_list(list_handle)
self.end_loc = list_handle.tell()
def write_index(self, index_handle):
# How much space will this particular index take up?
# TODO magic nums
index_size = 18 * len(self.index.values()) + 34
index_loc = index_handle.tell()
# reserve space for this index
index_handle.write(" " * index_size)
addresses = {}
for char, entry in self.index.iteritems():
addresses[char] = index_handle.tell()
entry.write_index()
# now write the index to the reserved space
tmp = index_handle.tell()
index_handle.seek(index_loc, 0)
# TODO this should be a byte-value, not a string. Idiot.
index_handle.write(" ".join(["%s%16d" % pair for pair in address.iteritems()]))
index_handle.write(" %32d %32d\n" % self.start_loc, self.end_loc)
def build_index(filenames, depth):
index = IndexEntry("")
for filename in filenames:
handle = open(filename);
for word in words_in_file(handle, depth):
index.add_file_word(filename, word)
return index
def list_files(root, depth):
# Silly and HACK &c.
return glob.glob(root + ("/*" * depth))
if __name__ == '__main__':
files = list_files("/home/diiq/Downloads/enron_mail_20110402/maildir", 3)
ind = build_index(files[:100], 3);
outfile = "foo.lst"
indfile = "foo.ind"
ind.write_list(open(outfile, 'w'), None)
ind.write_(open(outfile, 'w'), None)
# TOTALLY a sketch. Incomplete, untested, and DOES NOT currently match output format of build_index.
# a sketch ONLY.
"""Searches a character/word indexed tree of files for a given string."""
# File of indices (smaller)
# header_length charloc charloc... loc
# File of documents
# word filename\n...
# OK, in truth, you'd want multiple file pointers, and some stuff, but it's a place to start.
# TODO: print the words for the results.
def load_index(handle):
header_len = handle.read(1)
header = handle.read(header_len)
index = {}
for pair in index.rsplit():
index[pair[0]] = int(pair[1:])
return index
def next_index(index, letter, handle):
if not letter in index:
return False
handle.seek(index[letter].location)
return load_index(handle)
# Doesn't reset handle's location.
def lookup_partial(word, handle):
index = load_index(handle)
for letter in word:
index = next_index(loc, letter)
if not index:
return False
return index
def list_results(index, handle):
# First pick out the results that match exactly
# Bang for word as-is
handle.seek(index["!"])
handle.readline(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment