joehillen/bad_spellcheckers.py

## bad_spellcheckers.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Written by Joe Hillenbrand <joehillen@gmail.com> 2012

All rights reserved!
"""

import string
import re
import sys

def spellcheck1(word):
    '''
    The following is my first attempt.

    The idea was to generate a regular expression for the given word and return
    the first match found.

    Unfortnately, this would give bad results for already correct words or less
    than ideal matches. For example:

        > sheeeeep
        shap
        > sheeple
        shiploa
        > jjoooobb
        jab
        > FOOD
        fad

    This method gives no way to compare possible matches and determine the best
    one.

    I'm leaving this here to demonstrate my thought process.

    The following are tests using `doctest`. To run the tests do:

        python spellcheck.py -t

    TESTS
    >>> spellcheck1('sandwich')
    'sandwich'
    >>> spellcheck1('little')
    'little'
    >>> spellcheck1('sheeeeep')
    'sheep'
    >>> spellcheck1('peepple')
    'people'
    >>> spellcheck1('sheeple')
    'NO SUGGESTION'
    >>> spellcheck1('inSIDE')
    'inside'
    >>> spellcheck1('jjoobbb')
    'jab'
    >>> spellcheck1('weke')
    'wake'
    >>> spellcheck1('CUNsperrICY')
    'conspiracy'
    >>> spellcheck1('ffoaoaoaoaoaoaaoaoaoaoaoadd')
    'food'
    >>> spellcheck1('FOOD')
    'food'

    PATHOLOGICAL TESTS
    >>> spellcheck1('')
    'NO SUGGESTION'
    >>> spellcheck1('1337')
    'NO SUGGESTION'
    >>> spellcheck1('woot!')
    'NO SUGGESTION'
    >>> spellcheck1('two words')
    'NO SUGGESTION'
    >>> spellcheck1(u'ಠ_ಠ')
    'NO SUGGESTION'
    >>> spellcheck1('fo0bar')
    'NO SUGGESTION'
    '''

    vowels = 'aeiou' # let's not consider 'y' a vowel in this case

    # generate all the consonants
    consonants = [c for c in string.ascii_lowercase if c not in vowels]


    '''
    The plan is to generate to a regular expression (wordre) that can be used to
    search the dictionary.
    '''
    wordre = ''
    prev_ltr = ''

    word = word.lower()
    for ltr in word:
        ltr = ltr.lower()
        if ltr in consonants:
            if ltr != prev_ltr:
                wordre = ''.join([wordre, ltr, '+'])
                prev_ltr = ltr
        elif ltr in vowels:
            if prev_ltr not in vowels or prev_ltr == '':
                wordre = ''.join([wordre, '[', vowels, ']+'])
                prev_ltr = ltr
        else:
            return 'NO SUGGESTION'

    if len(wordre) <= 0:
        return 'NO SUGGESTION'

    regex = re.compile(wordre)
    with open('/usr/share/dict/words','r') as words:
        for w in words:
            m = regex.match(w)
            if m is not None:
                return m.group(0)

    return 'NO SUGGESTION'


class SpellCheck:
    '''
    This is my second attempt.

    It preloads the dictionary into a set for faster searching and uses
    a breadth first search algorithm.

    I am happier with its answers, but it can consume endless memory on
    pathologically long cases that have lots of vowels. For example,
      wwoweyuoaweoaueaowewasoiue

    I'd love to work on this some more, but I've used up all my time.

    I think I would like to try a solution that combines the 2 methods next,
    or a version of the first attempt that grabs all the valid words and
    determines the best one.
    '''

    def __init__(self):
        '''
        Pre-generate a word list so we don't do it at every attempt.

        Use a set() because it has constant (O(1)) lookup time.
        http://wiki.python.org/moin/PythonSpeed#Use_the_best_algorithms_and_fastest_tools
        '''
        self.words = set()
        with open('/usr/share/dict/words','r') as f:
            for word in f:
                self.words.add(word.strip())

    def gen_possible(self, word):
        '''
        Generator for all the possible misspellings of "word".

        This is implemented as a breadth first search.

        This should also count as the extra credit.
        '''

        # First try the word itself
        yield word

        # Next, try it in all lowercase
        word = word.lower()

        vowels = 'aeiou' # let's not consider 'y' a vowel in this case
        revowel = re.compile(r'[aeiou]')
        repeat = re.compile(r'([a-z])\1')

        visited = set()
        possible = [word]
        while len(possible) > 0:
            word = possible.pop()
            yield word

            if word not in visited:
                # Now try replacing vowels
                matches = revowel.finditer(word)
                for m in matches: # m is for "match"
                    s = word[:m.start(0)] # s is for "start"
                    e = word[m.end(0):] # e is for "end"
                    for v in vowels: # v is for "vowel"
                        w = ''.join([s, v, e]) # w is for "word"
                        if w not in possible:
                            possible.append(w)

                # Try removing duplicates
                matches = repeat.finditer(word)
                for m in matches:
                    s = word[:m.start(0)] # s is for "start"
                    e = word[m.end(0)-1:] # e is for "end"
                    w = ''.join([s, e]) # w is for "word"
                    if w not in possible:
                        possible.append(w)

            visited.add(word)
        return


    def spellcheck(self, word):
        '''
        The following are tests using `doctest`. To run the tests do:

            python spellcheck.py -t

        TESTS
        >>> s.spellcheck('sandwich')
        'sandwich'
        >>> s.spellcheck('little')
        'little'
        >>> s.spellcheck('sheeeeep')
        'sheep'
        >>> s.spellcheck('peepple')
        'people'
        >>> s.spellcheck('sheeple')
        'NO SUGGESTION'
        >>> s.spellcheck('inSIDE')
        'inside'
        >>> s.spellcheck('jjoobbb')
        'job'
        >>> s.spellcheck('weke')
        'wiki'
        >>> s.spellcheck('CUNsperrICY')
        'conspiracy'
        >>> s.spellcheck('FOOD')
        'food'

        PATHOLOGICAL TESTS
        >>> s.spellcheck('')
        'NO SUGGESTION'
        >>> s.spellcheck('1337')
        'NO SUGGESTION'
        >>> s.spellcheck('woot!')
        'NO SUGGESTION'
        >>> s.spellcheck('two words')
        'NO SUGGESTION'
        >>> s.spellcheck(u'ಠ_ಠ')
        'NO SUGGESTION'
        >>> s.spellcheck('fo0bar')
        'NO SUGGESTION'
        '''

        for w in self.gen_possible(word):
            if w in self.words:
                return w

        return 'NO SUGGESTION'


if __name__ == '__main__':

    s = SpellCheck()

    if len(sys.argv) > 1 and sys.argv[1] == '-t':
        import doctest
        doctest.testmod(extraglobs={'s': s })
        sys.exit(0)

    while True:
        word = raw_input('> ')
        print s.spellcheck(word)
	#!/usr/bin/env python
	# -- coding: utf-8 --

	"""
	Written by Joe Hillenbrand <joehillen@gmail.com> 2012

	All rights reserved!
	"""

	import string
	import re
	import sys

	def spellcheck1(word):
	'''
	The following is my first attempt.

	The idea was to generate a regular expression for the given word and return
	the first match found.

	Unfortnately, this would give bad results for already correct words or less
	than ideal matches. For example:

	> sheeeeep
	shap
	> sheeple
	shiploa
	> jjoooobb
	jab
	> FOOD
	fad

	This method gives no way to compare possible matches and determine the best
	one.

	I'm leaving this here to demonstrate my thought process.

	The following are tests using `doctest`. To run the tests do:

	python spellcheck.py -t

	TESTS
	>>> spellcheck1('sandwich')
	'sandwich'
	>>> spellcheck1('little')
	'little'
	>>> spellcheck1('sheeeeep')
	'sheep'
	>>> spellcheck1('peepple')
	'people'
	>>> spellcheck1('sheeple')
	'NO SUGGESTION'
	>>> spellcheck1('inSIDE')
	'inside'
	>>> spellcheck1('jjoobbb')
	'jab'
	>>> spellcheck1('weke')
	'wake'
	>>> spellcheck1('CUNsperrICY')
	'conspiracy'
	>>> spellcheck1('ffoaoaoaoaoaoaaoaoaoaoaoadd')
	'food'
	>>> spellcheck1('FOOD')
	'food'

	PATHOLOGICAL TESTS
	>>> spellcheck1('')
	'NO SUGGESTION'
	>>> spellcheck1('1337')
	'NO SUGGESTION'
	>>> spellcheck1('woot!')
	'NO SUGGESTION'
	>>> spellcheck1('two words')
	'NO SUGGESTION'
	>>> spellcheck1(u'ಠ_ಠ')
	'NO SUGGESTION'
	>>> spellcheck1('fo0bar')
	'NO SUGGESTION'
	'''

	vowels = 'aeiou' # let's not consider 'y' a vowel in this case

	# generate all the consonants
	consonants = [c for c in string.ascii_lowercase if c not in vowels]


	'''
	The plan is to generate to a regular expression (wordre) that can be used to
	search the dictionary.
	'''
	wordre = ''
	prev_ltr = ''

	word = word.lower()
	for ltr in word:
	ltr = ltr.lower()
	if ltr in consonants:
	if ltr != prev_ltr:
	wordre = ''.join([wordre, ltr, '+'])
	prev_ltr = ltr
	elif ltr in vowels:
	if prev_ltr not in vowels or prev_ltr == '':
	wordre = ''.join([wordre, '[', vowels, ']+'])
	prev_ltr = ltr
	else:
	return 'NO SUGGESTION'

	if len(wordre) <= 0:
	return 'NO SUGGESTION'

	regex = re.compile(wordre)
	with open('/usr/share/dict/words','r') as words:
	for w in words:
	m = regex.match(w)
	if m is not None:
	return m.group(0)

	return 'NO SUGGESTION'


	class SpellCheck:
	'''
	This is my second attempt.

	It preloads the dictionary into a set for faster searching and uses
	a breadth first search algorithm.

	I am happier with its answers, but it can consume endless memory on
	pathologically long cases that have lots of vowels. For example,
	wwoweyuoaweoaueaowewasoiue

	I'd love to work on this some more, but I've used up all my time.

	I think I would like to try a solution that combines the 2 methods next,
	or a version of the first attempt that grabs all the valid words and
	determines the best one.
	'''

	def __init__(self):
	'''
	Pre-generate a word list so we don't do it at every attempt.

	Use a set() because it has constant (O(1)) lookup time.
	http://wiki.python.org/moin/PythonSpeed#Use_the_best_algorithms_and_fastest_tools
	'''
	self.words = set()
	with open('/usr/share/dict/words','r') as f:
	for word in f:
	self.words.add(word.strip())

	def gen_possible(self, word):
	'''
	Generator for all the possible misspellings of "word".

	This is implemented as a breadth first search.

	This should also count as the extra credit.
	'''

	# First try the word itself
	yield word

	# Next, try it in all lowercase
	word = word.lower()

	vowels = 'aeiou' # let's not consider 'y' a vowel in this case
	revowel = re.compile(r'[aeiou]')
	repeat = re.compile(r'([a-z])\1')

	visited = set()
	possible = [word]
	while len(possible) > 0:
	word = possible.pop()
	yield word

	if word not in visited:
	# Now try replacing vowels
	matches = revowel.finditer(word)
	for m in matches: # m is for "match"
	s = word[:m.start(0)] # s is for "start"
	e = word[m.end(0):] # e is for "end"
	for v in vowels: # v is for "vowel"
	w = ''.join([s, v, e]) # w is for "word"
	if w not in possible:
	possible.append(w)

	# Try removing duplicates
	matches = repeat.finditer(word)
	for m in matches:
	s = word[:m.start(0)] # s is for "start"
	e = word[m.end(0)-1:] # e is for "end"
	w = ''.join([s, e]) # w is for "word"
	if w not in possible:
	possible.append(w)

	visited.add(word)
	return


	def spellcheck(self, word):
	'''
	The following are tests using `doctest`. To run the tests do:

	python spellcheck.py -t

	TESTS
	>>> s.spellcheck('sandwich')
	'sandwich'
	>>> s.spellcheck('little')
	'little'
	>>> s.spellcheck('sheeeeep')
	'sheep'
	>>> s.spellcheck('peepple')
	'people'
	>>> s.spellcheck('sheeple')
	'NO SUGGESTION'
	>>> s.spellcheck('inSIDE')
	'inside'
	>>> s.spellcheck('jjoobbb')
	'job'
	>>> s.spellcheck('weke')
	'wiki'
	>>> s.spellcheck('CUNsperrICY')
	'conspiracy'
	>>> s.spellcheck('FOOD')
	'food'

	PATHOLOGICAL TESTS
	>>> s.spellcheck('')
	'NO SUGGESTION'
	>>> s.spellcheck('1337')
	'NO SUGGESTION'
	>>> s.spellcheck('woot!')
	'NO SUGGESTION'
	>>> s.spellcheck('two words')
	'NO SUGGESTION'
	>>> s.spellcheck(u'ಠ_ಠ')
	'NO SUGGESTION'
	>>> s.spellcheck('fo0bar')
	'NO SUGGESTION'
	'''

	for w in self.gen_possible(word):
	if w in self.words:
	return w

	return 'NO SUGGESTION'


	if __name__ == '__main__':

	s = SpellCheck()

	if len(sys.argv) > 1 and sys.argv[1] == '-t':
	import doctest
	doctest.testmod(extraglobs={'s': s })
	sys.exit(0)

	while True:
	word = raw_input('> ')
	print s.spellcheck(word)