Skip to content

Instantly share code, notes, and snippets.

@corbanbrook
Created October 26, 2009 18:06
Show Gist options
  • Save corbanbrook/218883 to your computer and use it in GitHub Desktop.
Save corbanbrook/218883 to your computer and use it in GitHub Desktop.
Spell Correct by Peter Norvig
# Didn't realize this but Brian Adkins wrote a ruby version as well, code is almost identical to my own.
def words text
text.downcase.scan(/[a-z]+/)
end
def train features
model = Hash.new(1)
features.each {|f| model[f] += 1 }
return model
end
NWORDS = train(words(File.new('holmes.txt').read))
LETTERS = ("a".."z").to_a.join
def edits1 word
n = word.length
deletion = (0...n).collect {|i| word[0...i]+word[i+1..-1] }
transposition = (0...n-1).collect {|i| word[0...i]+word[i+1,1]+word[i,1]+word[i+2..-1] }
alteration = []
n.times {|i| LETTERS.each_byte {|l| alteration << word[0...i]+l.chr+word[i+1..-1] } }
insertion = []
(n+1).times {|i| LETTERS.each_byte {|l| insertion << word[0...i]+l.chr+word[i..-1] } }
result = deletion + transposition + alteration + insertion
result.empty? ? nil : result
end
def known_edits2 word
result = []
edits1(word).each {|e1| edits1(e1).each {|e2| result << e2 if NWORDS.has_key?(e2) }}
result.empty? ? nil : result
end
def known words
result = words.find_all {|w| NWORDS.has_key?(w) }
result.empty? ? nil : result
end
def correct word
(known([word]) or known(edits1(word)) or known_edits2(word) or
[word]).max {|a,b| NWORDS[a] <=> NWORDS[b] }
end
# Jordan Baker's Python Presentation: http://www.slideshare.net/hexsprite/a-taste-of-python-devdays-toronto-2009
# 23 lines of Python code by Peter Norvig http://norvig.com/spell-correct.html
import re, collections
def words(text):
return re.findall('[a-z]+', text.lower())
def train(features):
model = collections.defaultdict(lambda: 1)
for f in features:
model[f] += 1
return model
NWORDS = train(words(file('big.txt').read()))
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def edits1(word):
s = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [a + b[1:] for a, b in s if b]
transposes = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1]
replaces = [a + c + b[1:] for a, b in s for c in alphabet if b]
inserts = [a + c + b for a, b in s for c in alphabet]
return set(deletes + transposes + replaces + inserts)
def known_edits2(word):
return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
def known(words): return set(w for w in words if w in NWORDS)
def correct(word):
candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
return max(candidates, key=NWORDS.get)
# 30 lines of Ruby code by Corban Brook (24 lines exluding 'end')
def words(text)
text.downcase.scan /[a-z]+/
end
def train(words)
model = Hash.new(0)
words.each { |w| model[w] += 1 }
model
end
NWORDS = train(words(File.read('big.txt')))
ALPHABET = *('a'..'z')
def edits1(word)
s = (0..word.length).collect { |i| [word[0, i], word[i, word.length]] }
deletes = s.collect { |a, b| a + b[1, b.length] if b.any? }.compact
transposes = s.collect { |a, b| a + b[1, 1] + b[0, 1] + b[2, b.length] if b.length > 1 }.compact
replaces = s.collect { |a, b| ALPHABET.collect { |c| a + c + b[1, b.length] if b.any? } }.flatten.compact
inserts = s.collect { |a, b| ALPHABET.collect { |c| a + c + b } }.flatten
deletes + transposes + replaces + inserts
end
def known_edits2(word)
result = []
edits1(word).each { |e1| result += edits1(e1).select { |e2| NWORDS.has_key? e2 } }
result.empty? ? nil : result.uniq
end
def known(words)
result = words.select { |w| NWORDS.has_key? w }
result.empty? ? nil : result
end
def correct(word)
(known([word]) or known(edits1(word)) or known_edits2(word) or [word]).max { |a, b| NWORDS[a] <=> NWORDS[b] }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment