Skip to content

Instantly share code, notes, and snippets.

@joehillen
Created July 11, 2013 07:40
Show Gist options
  • Save joehillen/5973352 to your computer and use it in GitHub Desktop.
Save joehillen/5973352 to your computer and use it in GitHub Desktop.
A crappy spellchecker implementation I wrote for a job interview. I didn't get the job =P
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Written by Joe Hillenbrand <joehillen@gmail.com> 2012
All rights reserved!
"""
import string
import re
import sys
def spellcheck1(word):
'''
The following is my first attempt.
The idea was to generate a regular expression for the given word and return
the first match found.
Unfortnately, this would give bad results for already correct words or less
than ideal matches. For example:
> sheeeeep
shap
> sheeple
shiploa
> jjoooobb
jab
> FOOD
fad
This method gives no way to compare possible matches and determine the best
one.
I'm leaving this here to demonstrate my thought process.
The following are tests using `doctest`. To run the tests do:
python spellcheck.py -t
TESTS
>>> spellcheck1('sandwich')
'sandwich'
>>> spellcheck1('little')
'little'
>>> spellcheck1('sheeeeep')
'sheep'
>>> spellcheck1('peepple')
'people'
>>> spellcheck1('sheeple')
'NO SUGGESTION'
>>> spellcheck1('inSIDE')
'inside'
>>> spellcheck1('jjoobbb')
'jab'
>>> spellcheck1('weke')
'wake'
>>> spellcheck1('CUNsperrICY')
'conspiracy'
>>> spellcheck1('ffoaoaoaoaoaoaaoaoaoaoaoadd')
'food'
>>> spellcheck1('FOOD')
'food'
PATHOLOGICAL TESTS
>>> spellcheck1('')
'NO SUGGESTION'
>>> spellcheck1('1337')
'NO SUGGESTION'
>>> spellcheck1('woot!')
'NO SUGGESTION'
>>> spellcheck1('two words')
'NO SUGGESTION'
>>> spellcheck1(u'ಠ_ಠ')
'NO SUGGESTION'
>>> spellcheck1('fo0bar')
'NO SUGGESTION'
'''
vowels = 'aeiou' # let's not consider 'y' a vowel in this case
# generate all the consonants
consonants = [c for c in string.ascii_lowercase if c not in vowels]
'''
The plan is to generate to a regular expression (wordre) that can be used to
search the dictionary.
'''
wordre = ''
prev_ltr = ''
word = word.lower()
for ltr in word:
ltr = ltr.lower()
if ltr in consonants:
if ltr != prev_ltr:
wordre = ''.join([wordre, ltr, '+'])
prev_ltr = ltr
elif ltr in vowels:
if prev_ltr not in vowels or prev_ltr == '':
wordre = ''.join([wordre, '[', vowels, ']+'])
prev_ltr = ltr
else:
return 'NO SUGGESTION'
if len(wordre) <= 0:
return 'NO SUGGESTION'
regex = re.compile(wordre)
with open('/usr/share/dict/words','r') as words:
for w in words:
m = regex.match(w)
if m is not None:
return m.group(0)
return 'NO SUGGESTION'
class SpellCheck:
'''
This is my second attempt.
It preloads the dictionary into a set for faster searching and uses
a breadth first search algorithm.
I am happier with its answers, but it can consume endless memory on
pathologically long cases that have lots of vowels. For example,
wwoweyuoaweoaueaowewasoiue
I'd love to work on this some more, but I've used up all my time.
I think I would like to try a solution that combines the 2 methods next,
or a version of the first attempt that grabs all the valid words and
determines the best one.
'''
def __init__(self):
'''
Pre-generate a word list so we don't do it at every attempt.
Use a set() because it has constant (O(1)) lookup time.
http://wiki.python.org/moin/PythonSpeed#Use_the_best_algorithms_and_fastest_tools
'''
self.words = set()
with open('/usr/share/dict/words','r') as f:
for word in f:
self.words.add(word.strip())
def gen_possible(self, word):
'''
Generator for all the possible misspellings of "word".
This is implemented as a breadth first search.
This should also count as the extra credit.
'''
# First try the word itself
yield word
# Next, try it in all lowercase
word = word.lower()
vowels = 'aeiou' # let's not consider 'y' a vowel in this case
revowel = re.compile(r'[aeiou]')
repeat = re.compile(r'([a-z])\1')
visited = set()
possible = [word]
while len(possible) > 0:
word = possible.pop()
yield word
if word not in visited:
# Now try replacing vowels
matches = revowel.finditer(word)
for m in matches: # m is for "match"
s = word[:m.start(0)] # s is for "start"
e = word[m.end(0):] # e is for "end"
for v in vowels: # v is for "vowel"
w = ''.join([s, v, e]) # w is for "word"
if w not in possible:
possible.append(w)
# Try removing duplicates
matches = repeat.finditer(word)
for m in matches:
s = word[:m.start(0)] # s is for "start"
e = word[m.end(0)-1:] # e is for "end"
w = ''.join([s, e]) # w is for "word"
if w not in possible:
possible.append(w)
visited.add(word)
return
def spellcheck(self, word):
'''
The following are tests using `doctest`. To run the tests do:
python spellcheck.py -t
TESTS
>>> s.spellcheck('sandwich')
'sandwich'
>>> s.spellcheck('little')
'little'
>>> s.spellcheck('sheeeeep')
'sheep'
>>> s.spellcheck('peepple')
'people'
>>> s.spellcheck('sheeple')
'NO SUGGESTION'
>>> s.spellcheck('inSIDE')
'inside'
>>> s.spellcheck('jjoobbb')
'job'
>>> s.spellcheck('weke')
'wiki'
>>> s.spellcheck('CUNsperrICY')
'conspiracy'
>>> s.spellcheck('FOOD')
'food'
PATHOLOGICAL TESTS
>>> s.spellcheck('')
'NO SUGGESTION'
>>> s.spellcheck('1337')
'NO SUGGESTION'
>>> s.spellcheck('woot!')
'NO SUGGESTION'
>>> s.spellcheck('two words')
'NO SUGGESTION'
>>> s.spellcheck(u'ಠ_ಠ')
'NO SUGGESTION'
>>> s.spellcheck('fo0bar')
'NO SUGGESTION'
'''
for w in self.gen_possible(word):
if w in self.words:
return w
return 'NO SUGGESTION'
if __name__ == '__main__':
s = SpellCheck()
if len(sys.argv) > 1 and sys.argv[1] == '-t':
import doctest
doctest.testmod(extraglobs={'s': s })
sys.exit(0)
while True:
word = raw_input('> ')
print s.spellcheck(word)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment