dwickwire/spelling.py

## spelling.py
# Probability of a spelling correction, c =
#   Probability(c is a word) *
#       Probability(original is a typo for c)
# Best correction =
#   one with highest probability
# Probability(c is a word) =
#   estimated by counting
# Probability(original is a typo for c) =
#   proportional to number of changes

# correction("thew") = ?
# Compare:
# P("the" is a word) * P("thew" typo "the")
# P("thaw" is a word) * P("thew" typo "thaw")
# ...

import string, collections


def train(filename):
    P = collections.defaultdict(lamba: 1)
    for line in file(filename):
        word, count = line.split()
        P[word] = int(count)
    return P


P = train('en100k.txt')


def edits1(word):
    n = len(word)
    return set([word[0:i]+word[i+1:] for i in range(n)] + # deletion
            [word[0:i]+word[i]+word[i+2:] for i in range(n-1) + # transposition
            [word[0:i]+c+word[i+1:] for i in range(n) for c in string.lowercase] + # alteration
            [word[0:i]+c+word[i:] for i in range(n+1) for c in string.lowercase]) # insertion


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in P)


def known(words):
    return set(w for w in words if w in P)


def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return argmax(candidates, P)
	# Probability of a spelling correction, c =
	# Probability(c is a word) *
	# Probability(original is a typo for c)
	# Best correction =
	# one with highest probability
	# Probability(c is a word) =
	# estimated by counting
	# Probability(original is a typo for c) =
	# proportional to number of changes

	# correction("thew") = ?
	# Compare:
	# P("the" is a word) * P("thew" typo "the")
	# P("thaw" is a word) * P("thew" typo "thaw")
	# ...

	import string, collections


	def train(filename):
	P = collections.defaultdict(lamba: 1)
	for line in file(filename):
	word, count = line.split()
	P[word] = int(count)
	return P


	P = train('en100k.txt')


	def edits1(word):
	n = len(word)
	return set([word[0:i]+word[i+1:] for i in range(n)] + # deletion
	[word[0:i]+word[i]+word[i+2:] for i in range(n-1) + # transposition
	[word[0:i]+c+word[i+1:] for i in range(n) for c in string.lowercase] + # alteration
	[word[0:i]+c+word[i:] for i in range(n+1) for c in string.lowercase]) # insertion


	def known_edits2(word):
	return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in P)


	def known(words):
	return set(w for w in words if w in P)


	def correct(word):
	candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
	return argmax(candidates, P)