atomictom/sentence-parse.py

## sentence-parse.py
#!/usr/bin/python

import os
import sys

class Node(object):

    def __init__(self):
        self.is_word = False
        self.children = {}

class Trie(object):

    def __init__(self, words):
        self.root = Node()
        for word in words:
            self.add_word(word)

    def add_word(self, word):
        cur_node = self.root
        for c in word:
            if not c in cur_node.children:
                new_node = Node()
                cur_node.children[c] = new_node
            cur_node = cur_node.children[c]
        cur_node.is_word = True

    def is_word(self, word):
        cur_node = self.root
        for c in word:
            if c not in cur_node.children:
                return False
            cur_node = cur_node.children[c]
        return cur_node.is_word

# parse_words :: Trie -> String -> [(String, String)]
def parse_words(dictionary, sentence):
    cur_node = dictionary.root
    cur_word = ""

    if not sentence:
        return [("", "")]

    possible_words = []
    for i, c in enumerate(sentence):
        if c not in cur_node.children:
            break
        cur_word += c
        cur_node = cur_node.children[c]
        if cur_node.is_word:
            word = cur_word
            remaining = sentence[i+1:]
            possible_words.append((word, remaining))

    return possible_words

# parse_ngram :: Trie -> Int -> String -> [([String], String)]
def parse_ngram(dictionary, n, sentence):
    if n <= 0:
        return [([], sentence)]

    ngrams = []
    for word, remaining in parse_words(dictionary, sentence):
        possible_ngrams = parse_ngram(dictionary, n-1, remaining)
        new_word = ([word] if word else [])
        ngrams.extend([(new_word + words, ngram_remaining) for words, ngram_remaining in possible_ngrams])
    return ngrams

def beam_parse(sentence, dictionary, ngram_freqs, beam_width=5):
    heuristic = lambda words: score_words(ngram_freqs, words)
    top_candidates = [([], sentence)]

    while any(remaining for _, remaining in top_candidates):
        next_candidates = []
        for words, remaining in top_candidates:
            parses = parse_ngram(dictionary, 5, remaining)
            next_candidates.extend((words + new_words, new_remaining) for new_words, new_remaining in parses)

        ranked = [(heuristic(words), words, remaining) for words, remaining in next_candidates]
        top_ranked = sorted(ranked, reverse=True)[:beam_width]
        top_candidates = [(words, remaining) for _, words, remaining in top_ranked]
        print "\n".join(map(str, top_candidates))

    return [' '.join(words) for words, remaining in top_candidates]


def score_words(ngram_freqs, words):
    score = 0

    for i in range(len(words)):
        for j in range(i, len(words)):
            key = tuple(words[i:j+1])
            if 5 >= len(key) > 1 and key in ngram_freqs:
                modifier = reduce(lambda x, y: x * y, [get_word_len_freqs().get(len(word), 0.001) for word in key])
                score += ngram_freqs[key] * modifier
    return score

def get_words():
    with open("/usr/share/dict/words") as fin:
        words = [word.strip().lower() for word in fin.readlines()]
    return words

def get_ngrams_freqs_upto(n):
    ngram_freqs = {}
    for i in range(1, n+1):
        with open(str(i) + "gram-freq.txt") as fin:
            for line in fin.readlines():
                fields = line.split()
                freq = fields[0]
                words = fields[1:]
                ngram_freqs[tuple(words)] = int(freq)
    return ngram_freqs

def get_word_len_freqs():
    return {
        1: .027,
        2: .679,
        3: 4.730,
        4: 7.151,
        5: 10.804,
        6: 13.674,
        7: 14.751,
        8: 13.616,
        9: 11.356,
        10: 8.679,
        11: 5.913,
        12: 3.792,
        13: 2.329,
        14: 1.232,
        15: 0.685,
        16: 0.290,
        17: 0.162,
        18: 0.066,
        19: 0.041
    }

def main():
    sys.setrecursionlimit(100000)

    sentence = sys.argv[1].lower()
    words = get_words()
    ngram_freqs = get_ngrams_freqs_upto(5)
    dictionary = Trie(words)

    parses = beam_parse(sentence, dictionary, ngram_freqs)
    for p in parses:
        print p

if __name__ == "__main__":
    main()
	#!/usr/bin/python

	import os
	import sys

	class Node(object):

	def __init__(self):
	self.is_word = False
	self.children = {}

	class Trie(object):

	def __init__(self, words):
	self.root = Node()
	for word in words:
	self.add_word(word)

	def add_word(self, word):
	cur_node = self.root
	for c in word:
	if not c in cur_node.children:
	new_node = Node()
	cur_node.children[c] = new_node
	cur_node = cur_node.children[c]
	cur_node.is_word = True

	def is_word(self, word):
	cur_node = self.root
	for c in word:
	if c not in cur_node.children:
	return False
	cur_node = cur_node.children[c]
	return cur_node.is_word

	# parse_words :: Trie -> String -> [(String, String)]
	def parse_words(dictionary, sentence):
	cur_node = dictionary.root
	cur_word = ""

	if not sentence:
	return [("", "")]

	possible_words = []
	for i, c in enumerate(sentence):
	if c not in cur_node.children:
	break
	cur_word += c
	cur_node = cur_node.children[c]
	if cur_node.is_word:
	word = cur_word
	remaining = sentence[i+1:]
	possible_words.append((word, remaining))

	return possible_words

	# parse_ngram :: Trie -> Int -> String -> [([String], String)]
	def parse_ngram(dictionary, n, sentence):
	if n <= 0:
	return [([], sentence)]

	ngrams = []
	for word, remaining in parse_words(dictionary, sentence):
	possible_ngrams = parse_ngram(dictionary, n-1, remaining)
	new_word = ([word] if word else [])
	ngrams.extend([(new_word + words, ngram_remaining) for words, ngram_remaining in possible_ngrams])
	return ngrams

	def beam_parse(sentence, dictionary, ngram_freqs, beam_width=5):
	heuristic = lambda words: score_words(ngram_freqs, words)
	top_candidates = [([], sentence)]

	while any(remaining for _, remaining in top_candidates):
	next_candidates = []
	for words, remaining in top_candidates:
	parses = parse_ngram(dictionary, 5, remaining)
	next_candidates.extend((words + new_words, new_remaining) for new_words, new_remaining in parses)

	ranked = [(heuristic(words), words, remaining) for words, remaining in next_candidates]
	top_ranked = sorted(ranked, reverse=True)[:beam_width]
	top_candidates = [(words, remaining) for _, words, remaining in top_ranked]
	print "\n".join(map(str, top_candidates))

	return [' '.join(words) for words, remaining in top_candidates]


	def score_words(ngram_freqs, words):
	score = 0

	for i in range(len(words)):
	for j in range(i, len(words)):
	key = tuple(words[i:j+1])
	if 5 >= len(key) > 1 and key in ngram_freqs:
	modifier = reduce(lambda x, y: x * y, [get_word_len_freqs().get(len(word), 0.001) for word in key])
	score += ngram_freqs[key] * modifier
	return score

	def get_words():
	with open("/usr/share/dict/words") as fin:
	words = [word.strip().lower() for word in fin.readlines()]
	return words

	def get_ngrams_freqs_upto(n):
	ngram_freqs = {}
	for i in range(1, n+1):
	with open(str(i) + "gram-freq.txt") as fin:
	for line in fin.readlines():
	fields = line.split()
	freq = fields[0]
	words = fields[1:]
	ngram_freqs[tuple(words)] = int(freq)
	return ngram_freqs

	def get_word_len_freqs():
	return {
	1: .027,
	2: .679,
	3: 4.730,
	4: 7.151,
	5: 10.804,
	6: 13.674,
	7: 14.751,
	8: 13.616,
	9: 11.356,
	10: 8.679,
	11: 5.913,
	12: 3.792,
	13: 2.329,
	14: 1.232,
	15: 0.685,
	16: 0.290,
	17: 0.162,
	18: 0.066,
	19: 0.041
	}

	def main():
	sys.setrecursionlimit(100000)

	sentence = sys.argv[1].lower()
	words = get_words()
	ngram_freqs = get_ngrams_freqs_upto(5)
	dictionary = Trie(words)

	parses = beam_parse(sentence, dictionary, ngram_freqs)
	for p in parses:
	print p

	if __name__ == "__main__":
	main()