BoltzmannBrain/Python spell check

## Python spell check
#-------------------------------------------------------------------------------
# Name:        Did you mean 'python spell check'?
# Purpose:     Correct the spelling of an input string.
#
# Author:      Alex Lavin
#
# Created:     17/08/2014
# Copyright:   (c) Lavin 2014
#-------------------------------------------------------------------------------

import re, string, operator
from collections import Counter
from math import log10

## Process the input text database
sample_text = file('big.txt').read()
def tokens(text):
    "List all word tokens of text, normalized to lowercase."
    return re.findall('[a-z]+', text.lower())
VOCAB = Counter(tokens(sample_text))

## Text analysis functions
def edits_1(word):
    "Return all string that are one edit away from word."
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)] # all possible splits of word
    insertions =     [a+b+c for a,b in splits for c in string.ascii_lowercase]
    deletions =      [a+b[1:] for a,b in splits if b] # at each split, delete the first character of 'b'
    substitutions =  [a+c+b[1:] for a,b in splits for c in string.ascii_lowercase if b] # same as deletion, but additionally insert 'c'
    transpositions = [a+b[1]+b[0]+b[2:] for a,b in splits if len(b)>1]
    return set(insertions + deletions + substitutions + transpositions)
def edits_2(word): # call edit function twice -> edit distance two
    "Return all string that are two edits away from word."
    return set(e2 for e1 in edits_1(word) for e2 in edits_1(e1))
def known(words):
    "Return subset of words that appear in the dictionary."
    return filter(VOCAB.get, words)
def correct(word, version=0):
    "Return the best spelling correction of word."
    if version==1: # Prefer edit distances of 0, 1, then 2, and then default to original word.
        candidates = (known({word}) or
                      known(edits_1(word)) or
                      known(edits_2(word)) or
                      [word])
        return max(candidates, key=VOCAB.get)
    candidates = edits(word).items()
    c, edit = max(candidates, key=lambda (c,e): Pedit(e) * P1w(c))
    return c
def correct_text(text):
    "Return the correct spelling of all words in text."
    return re.sub('[a-zA-Z]+', correct_match, text)
def correct_match(match):
    "Correct the spelling of match, first making it lowercase and calling correct(), then putting in the proper case."
    word = match.group()
    return case(word)(correct(word.lower()))
def case(text):
    "Return the case function for text."
    return(str.upper if text.isupper() else
           str.lower if text.islower() else
           str.title if text.istitle() else
           str)

## Probability distribution
class Pdist(dict):
    "A probability distribution estimated from counts in datafile."
    def __init__(self, data=[], N=None, smoothingfn=None):
        for key,count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.itervalues()))
        self.smoothingfn = smoothingfn or (lambda k, N: 1./N)
    def __call__(self, key):
        if key in self: return self[key]/self.N
        else: return self.smoothingfn(key, self.N)
def Laplace(counter, c=1):
    N = sum(counter.values())
    Nnew = N + c * (len(counter)+1)
    return lambda w: (counter[w]+c) / Nnew
def cPw(word, prev):
    "Conditional probability of word, given previous word."
    bigram = prev + ' ' + word
    if P2w(bigram)>0 and P1w(prev)>0:
        return P2w(bigram) / P1w(prev)
    else: # average the back-off value and zero
        return P1w(word) / 2
def Pedit(edit):
    "Returns the probability of an edit."
    if edit == '': return (1. - p_error)
    return p_error*product(P1edit(e) for e in edit.split('+'))

## Segmentation
def memo(f):
    "Memoize function f, whose args must all be hashable."
    cache = {}
    def fmemo(*args):
        if args not in cache:
            cache[args] = f(*args)
        return cache[args]
    fmemo.cache = cache
    return fmemo
@memo
def segment(text, prev='<S>'):
    "Return best segmentation of text; words and their logP."
    if not text: return 0.0, []
    candidates = (combine(log10(cPw(first,prev)), first, segment(rest, first))  # log10 to avoid underflow
                   for first,rest in splits(text))
    return max(candidates)
L = max(map(len, VOCAB)) # longest word we've seen
def splits(text, L=20):
    """Return a list of all (first, rest) pairs of the text,
    where start<=len(first)<=L+2."""
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text),L+2))]
def combine(Pfirst, first, (Prest, rest)):
    "Combine first and rest into words/probability."
    return Pfirst+Prest, [first]+rest

## From Norvig
def edits(word, d=2):
    "Return a dict of {correct: edit} pairs within d edits of word."
    results = {}
    def editsR(hd, tl, d, edits):
        def ed(L,R): return edits+[R+'|'+L]
        C = hd+tl
        if C in P1w:
            e = '+'.join(edits)
            if C not in results: results[C] = e
            else: results[C] = max(results[C], e, key=Pedit)
        if d <= 0: return
        extensions = [hd+c for c in string.ascii_lowercase if hd+c in PREFIXES]
        p = (hd[-1] if hd else '<') ## previous character
        ## Insertion
        for h in extensions:
            editsR(h, tl, d-1, ed(p+h[-1], p))
        if not tl: return
        ## Deletion
        editsR(hd, tl[1:], d-1, ed(p, p+tl[0]))
        for h in extensions:
            if h[-1] == tl[0]: ## Match
                editsR(h, tl[1:], d, edits)
            else: ## Replacement
                editsR(h, tl[1:], d-1, ed(h[-1], tl[0]))
        ## Transpose
        if len(tl)>=2 and tl[0]!=tl[1] and hd+tl[1] in PREFIXES:
            editsR(hd+tl[1], tl[0]+tl[2:], d-1,
                   ed(tl[1]+tl[0], tl[0:2]))
    ## Body of edits:
    editsR('', word, d, [])
    return results

## Helper functions
def count_data(name, sep='\t'):
    with open(name) as f:
        for line in f:
            yield line.split(sep)
def product(numbers):
    "Return the product of a sequence of numbers."
    return reduce(operator.mul, numbers, 1)

## Run program
N = 1024908267229 # Number of tokens in 'big.txt'
P1w = Pdist(count_data('unigram counts.txt'), N)
P2w = Pdist(count_data('bigram counts.txt'), N)
p_error = 1./10. # assume spelling error every 10 words
PREFIXES = set(w[:i] for w in P1w for i in range(len(w) + 1)) # Norvig code
P1edit = Pdist(count_data('count_1edit.txt')) # Probabilities of single edits

tests = ['mimosa brunhc resterants', 'Where is teh Golden Gtae Birdge?', 'mimosa brunhc resterants near teh Goldengate Birdge?']
for test in tests:
    new = correct_text(test)
    print test, '-> Did you mean "', new, '"?'
	#-------------------------------------------------------------------------------
	# Name: Did you mean 'python spell check'?
	# Purpose: Correct the spelling of an input string.
	#
	# Author: Alex Lavin
	#
	# Created: 17/08/2014
	# Copyright: (c) Lavin 2014
	#-------------------------------------------------------------------------------

	import re, string, operator
	from collections import Counter
	from math import log10

	## Process the input text database
	sample_text = file('big.txt').read()
	def tokens(text):
	"List all word tokens of text, normalized to lowercase."
	return re.findall('[a-z]+', text.lower())
	VOCAB = Counter(tokens(sample_text))

	## Text analysis functions
	def edits_1(word):
	"Return all string that are one edit away from word."
	splits = [(word[:i], word[i:]) for i in range(len(word)+1)] # all possible splits of word
	insertions = [a+b+c for a,b in splits for c in string.ascii_lowercase]
	deletions = [a+b[1:] for a,b in splits if b] # at each split, delete the first character of 'b'
	substitutions = [a+c+b[1:] for a,b in splits for c in string.ascii_lowercase if b] # same as deletion, but additionally insert 'c'
	transpositions = [a+b[1]+b[0]+b[2:] for a,b in splits if len(b)>1]
	return set(insertions + deletions + substitutions + transpositions)
	def edits_2(word): # call edit function twice -> edit distance two
	"Return all string that are two edits away from word."
	return set(e2 for e1 in edits_1(word) for e2 in edits_1(e1))
	def known(words):
	"Return subset of words that appear in the dictionary."
	return filter(VOCAB.get, words)
	def correct(word, version=0):
	"Return the best spelling correction of word."
	if version==1: # Prefer edit distances of 0, 1, then 2, and then default to original word.
	candidates = (known({word}) or
	known(edits_1(word)) or
	known(edits_2(word)) or
	[word])
	return max(candidates, key=VOCAB.get)
	candidates = edits(word).items()
	c, edit = max(candidates, key=lambda (c,e): Pedit(e) * P1w(c))
	return c
	def correct_text(text):
	"Return the correct spelling of all words in text."
	return re.sub('[a-zA-Z]+', correct_match, text)
	def correct_match(match):
	"Correct the spelling of match, first making it lowercase and calling correct(), then putting in the proper case."
	word = match.group()
	return case(word)(correct(word.lower()))
	def case(text):
	"Return the case function for text."
	return(str.upper if text.isupper() else
	str.lower if text.islower() else
	str.title if text.istitle() else
	str)

	## Probability distribution
	class Pdist(dict):
	"A probability distribution estimated from counts in datafile."
	def __init__(self, data=[], N=None, smoothingfn=None):
	for key,count in data:
	self[key] = self.get(key, 0) + int(count)
	self.N = float(N or sum(self.itervalues()))
	self.smoothingfn = smoothingfn or (lambda k, N: 1./N)
	def __call__(self, key):
	if key in self: return self[key]/self.N
	else: return self.smoothingfn(key, self.N)
	def Laplace(counter, c=1):
	N = sum(counter.values())
	Nnew = N + c * (len(counter)+1)
	return lambda w: (counter[w]+c) / Nnew
	def cPw(word, prev):
	"Conditional probability of word, given previous word."
	bigram = prev + ' ' + word
	if P2w(bigram)>0 and P1w(prev)>0:
	return P2w(bigram) / P1w(prev)
	else: # average the back-off value and zero
	return P1w(word) / 2
	def Pedit(edit):
	"Returns the probability of an edit."
	if edit == '': return (1. - p_error)
	return p_error*product(P1edit(e) for e in edit.split('+'))

	## Segmentation
	def memo(f):
	"Memoize function f, whose args must all be hashable."
	cache = {}
	def fmemo(*args):
	if args not in cache:
	cache[args] = f(*args)
	return cache[args]
	fmemo.cache = cache
	return fmemo
	@memo
	def segment(text, prev='<S>'):
	"Return best segmentation of text; words and their logP."
	if not text: return 0.0, []
	candidates = (combine(log10(cPw(first,prev)), first, segment(rest, first)) # log10 to avoid underflow
	for first,rest in splits(text))
	return max(candidates)
	L = max(map(len, VOCAB)) # longest word we've seen
	def splits(text, L=20):
	"""Return a list of all (first, rest) pairs of the text,
	where start<=len(first)<=L+2."""
	return [(text[:i+1], text[i+1:])
	for i in range(min(len(text),L+2))]
	def combine(Pfirst, first, (Prest, rest)):
	"Combine first and rest into words/probability."
	return Pfirst+Prest, [first]+rest

	## From Norvig
	def edits(word, d=2):
	"Return a dict of {correct: edit} pairs within d edits of word."
	results = {}
	def editsR(hd, tl, d, edits):
	def ed(L,R): return edits+[R+'\|'+L]
	C = hd+tl
	if C in P1w:
	e = '+'.join(edits)
	if C not in results: results[C] = e
	else: results[C] = max(results[C], e, key=Pedit)
	if d <= 0: return
	extensions = [hd+c for c in string.ascii_lowercase if hd+c in PREFIXES]
	p = (hd[-1] if hd else '<') ## previous character
	## Insertion
	for h in extensions:
	editsR(h, tl, d-1, ed(p+h[-1], p))
	if not tl: return
	## Deletion
	editsR(hd, tl[1:], d-1, ed(p, p+tl[0]))
	for h in extensions:
	if h[-1] == tl[0]: ## Match
	editsR(h, tl[1:], d, edits)
	else: ## Replacement
	editsR(h, tl[1:], d-1, ed(h[-1], tl[0]))
	## Transpose
	if len(tl)>=2 and tl[0]!=tl[1] and hd+tl[1] in PREFIXES:
	editsR(hd+tl[1], tl[0]+tl[2:], d-1,
	ed(tl[1]+tl[0], tl[0:2]))
	## Body of edits:
	editsR('', word, d, [])
	return results

	## Helper functions
	def count_data(name, sep='\t'):
	with open(name) as f:
	for line in f:
	yield line.split(sep)
	def product(numbers):
	"Return the product of a sequence of numbers."
	return reduce(operator.mul, numbers, 1)

	## Run program
	N = 1024908267229 # Number of tokens in 'big.txt'
	P1w = Pdist(count_data('unigram counts.txt'), N)
	P2w = Pdist(count_data('bigram counts.txt'), N)
	p_error = 1./10. # assume spelling error every 10 words
	PREFIXES = set(w[:i] for w in P1w for i in range(len(w) + 1)) # Norvig code
	P1edit = Pdist(count_data('count_1edit.txt')) # Probabilities of single edits

	tests = ['mimosa brunhc resterants', 'Where is teh Golden Gtae Birdge?', 'mimosa brunhc resterants near teh Goldengate Birdge?']
	for test in tests:
	new = correct_text(test)
	print test, '-> Did you mean "', new, '"?'