Skip to content

Instantly share code, notes, and snippets.

@dwickwire
Created January 28, 2014 05:39
Show Gist options
  • Save dwickwire/8662788 to your computer and use it in GitHub Desktop.
Save dwickwire/8662788 to your computer and use it in GitHub Desktop.
Spelling algorithm by Peter Norvig.
# Probability of a spelling correction, c =
# Probability(c is a word) *
# Probability(original is a typo for c)
# Best correction =
# one with highest probability
# Probability(c is a word) =
# estimated by counting
# Probability(original is a typo for c) =
# proportional to number of changes
# correction("thew") = ?
# Compare:
# P("the" is a word) * P("thew" typo "the")
# P("thaw" is a word) * P("thew" typo "thaw")
# ...
import string, collections
def train(filename):
P = collections.defaultdict(lamba: 1)
for line in file(filename):
word, count = line.split()
P[word] = int(count)
return P
P = train('en100k.txt')
def edits1(word):
n = len(word)
return set([word[0:i]+word[i+1:] for i in range(n)] + # deletion
[word[0:i]+word[i]+word[i+2:] for i in range(n-1) + # transposition
[word[0:i]+c+word[i+1:] for i in range(n) for c in string.lowercase] + # alteration
[word[0:i]+c+word[i:] for i in range(n+1) for c in string.lowercase]) # insertion
def known_edits2(word):
return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in P)
def known(words):
return set(w for w in words if w in P)
def correct(word):
candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
return argmax(candidates, P)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment