diiq/build_index.py

## build_index.py
"""Builds a character/word-indexed tree of files."""

import itertools
import glob

punctuation = ".,;:/?<>{}[]()-_=+!@#$%^&*\/\\~*" # TODO be more systematic


def words_in_file(handle, max_len):
    """Returns an alphabetized list of unique words in the file.

    handle is a file handle with read access
    max_len is an int, the max length of word we're interested in indexing.
    """
    lines = handle.readlines()
    words = itertools.chain(*[line.rstrip().rsplit(" ") for line in lines])
    words = (word.rstrip(punctuation).lstrip(punctuation)[:max_len].upper() for word in words)
    return sorted(list(set(words)))

# OK, now what I'm after is adding a file to the index in all appropriate places.

# I'm going to start by being dumb, and just have the whole index floating in RAM on this end.


class IndexEntry(object):
    def __init__(self, word):
        self.word = word
        # index is a dictionary of characters to IndexEntries
        self.index = {}
        # files is a list of filenames which contain the word indexed by IndexEntry.
        self.files = []

    def add_file_word(self, filename, word):
        index = self
        if word == "":
            return
        for i, letter in enumerate(word):
            if not letter in index.index:
                index.index[letter] = IndexEntry(word)
            index = index.index[letter]
        index.files.append(filename);

    def __str__(self, depth=0):
        rstr = " " * depth * 2 + self.word + " " + str(len(self.files))
        rstr = rstr + "\n" + "\n".join([v.__str__(depth + 1) for v in self.index.itervalues()])
        return rstr

    def write_list(self, list_handle):
        self.start_loc = list_handle.tell()
        for filename in self.files:
            list_handle.write("%s %s\n" % (self.word, filename))
        for entry in self.index.itervalues():
            entry.write_list(list_handle)
        self.end_loc = list_handle.tell()

    def write_index(self, index_handle):
        # How much space will this particular index take up?
        # TODO magic nums
        index_size = 18 * len(self.index.values()) + 34
        index_loc = index_handle.tell()
        # reserve space for this index
        index_handle.write(" " * index_size)
        addresses = {}
        for char, entry in self.index.iteritems():
            addresses[char] = index_handle.tell()
            entry.write_index()
        # now write the index to the reserved space
        tmp = index_handle.tell()
        index_handle.seek(index_loc, 0)
        # TODO this should be a byte-value, not a string. Idiot.
        index_handle.write(" ".join(["%s%16d" % pair for pair in address.iteritems()]))
        index_handle.write(" %32d %32d\n" % self.start_loc, self.end_loc)

def build_index(filenames, depth):
    index = IndexEntry("")
    for filename in filenames:
        handle = open(filename);
        for word in words_in_file(handle, depth):
            index.add_file_word(filename, word)
    return index


def list_files(root, depth):
    # Silly and HACK &c.
    return glob.glob(root + ("/*" * depth))


if __name__ == '__main__':
    files = list_files("/home/diiq/Downloads/enron_mail_20110402/maildir", 3)
    ind =  build_index(files[:100], 3);
    outfile = "foo.lst"
    indfile = "foo.ind"
    ind.write_list(open(outfile, 'w'), None)
    ind.write_(open(outfile, 'w'), None)

## search.py
# TOTALLY a sketch. Incomplete, untested, and DOES NOT currently match output format of build_index.
# a sketch ONLY.

"""Searches a character/word indexed tree of files for a given string."""

# File of indices (smaller)
# header_length charloc charloc... loc

# File of documents
# word filename\n...

# OK, in truth, you'd want multiple file pointers, and some stuff, but it's a place to start.
# TODO: print the words for the results.

def load_index(handle):
    header_len = handle.read(1)
    header = handle.read(header_len)
    index = {}
    for pair in index.rsplit():
        index[pair[0]] = int(pair[1:])
    return index

def next_index(index, letter, handle):
    if not letter in index:
        return False
    handle.seek(index[letter].location)
    return load_index(handle)

# Doesn't reset handle's location.
def lookup_partial(word, handle):
    index = load_index(handle)
    for letter in word:
        index = next_index(loc, letter)
        if not index:
            return False
    return index

def list_results(index, handle):
    # First pick out the results that match exactly
    # Bang for word as-is
    handle.seek(index["!"])
    handle.readline(1)
	"""Builds a character/word-indexed tree of files."""

	import itertools
	import glob

	punctuation = ".,;:/?<>{}[]()-_=+!@#$%^&\/\\~" # TODO be more systematic


	def words_in_file(handle, max_len):
	"""Returns an alphabetized list of unique words in the file.

	handle is a file handle with read access
	max_len is an int, the max length of word we're interested in indexing.
	"""
	lines = handle.readlines()
	words = itertools.chain(*[line.rstrip().rsplit(" ") for line in lines])
	words = (word.rstrip(punctuation).lstrip(punctuation)[:max_len].upper() for word in words)
	return sorted(list(set(words)))

	# OK, now what I'm after is adding a file to the index in all appropriate places.

	# I'm going to start by being dumb, and just have the whole index floating in RAM on this end.


	class IndexEntry(object):
	def __init__(self, word):
	self.word = word
	# index is a dictionary of characters to IndexEntries
	self.index = {}
	# files is a list of filenames which contain the word indexed by IndexEntry.
	self.files = []

	def add_file_word(self, filename, word):
	index = self
	if word == "":
	return
	for i, letter in enumerate(word):
	if not letter in index.index:
	index.index[letter] = IndexEntry(word)
	index = index.index[letter]
	index.files.append(filename);

	def __str__(self, depth=0):
	rstr = " " * depth * 2 + self.word + " " + str(len(self.files))
	rstr = rstr + "\n" + "\n".join([v.__str__(depth + 1) for v in self.index.itervalues()])
	return rstr

	def write_list(self, list_handle):
	self.start_loc = list_handle.tell()
	for filename in self.files:
	list_handle.write("%s %s\n" % (self.word, filename))
	for entry in self.index.itervalues():
	entry.write_list(list_handle)
	self.end_loc = list_handle.tell()

	def write_index(self, index_handle):
	# How much space will this particular index take up?
	# TODO magic nums
	index_size = 18 * len(self.index.values()) + 34
	index_loc = index_handle.tell()
	# reserve space for this index
	index_handle.write(" " * index_size)
	addresses = {}
	for char, entry in self.index.iteritems():
	addresses[char] = index_handle.tell()
	entry.write_index()
	# now write the index to the reserved space
	tmp = index_handle.tell()
	index_handle.seek(index_loc, 0)
	# TODO this should be a byte-value, not a string. Idiot.
	index_handle.write(" ".join(["%s%16d" % pair for pair in address.iteritems()]))
	index_handle.write(" %32d %32d\n" % self.start_loc, self.end_loc)

	def build_index(filenames, depth):
	index = IndexEntry("")
	for filename in filenames:
	handle = open(filename);
	for word in words_in_file(handle, depth):
	index.add_file_word(filename, word)
	return index


	def list_files(root, depth):
	# Silly and HACK &c.
	return glob.glob(root + ("/" depth))


	if __name__ == '__main__':
	files = list_files("/home/diiq/Downloads/enron_mail_20110402/maildir", 3)
	ind = build_index(files[:100], 3);
	outfile = "foo.lst"
	indfile = "foo.ind"
	ind.write_list(open(outfile, 'w'), None)
	ind.write_(open(outfile, 'w'), None)
	# TOTALLY a sketch. Incomplete, untested, and DOES NOT currently match output format of build_index.
	# a sketch ONLY.

	"""Searches a character/word indexed tree of files for a given string."""

	# File of indices (smaller)
	# header_length charloc charloc... loc

	# File of documents
	# word filename\n...

	# OK, in truth, you'd want multiple file pointers, and some stuff, but it's a place to start.
	# TODO: print the words for the results.

	def load_index(handle):
	header_len = handle.read(1)
	header = handle.read(header_len)
	index = {}
	for pair in index.rsplit():
	index[pair[0]] = int(pair[1:])
	return index

	def next_index(index, letter, handle):
	if not letter in index:
	return False
	handle.seek(index[letter].location)
	return load_index(handle)

	# Doesn't reset handle's location.
	def lookup_partial(word, handle):
	index = load_index(handle)
	for letter in word:
	index = next_index(loc, letter)
	if not index:
	return False
	return index

	def list_results(index, handle):
	# First pick out the results that match exactly
	# Bang for word as-is
	handle.seek(index["!"])
	handle.readline(1)