bwrsandman/bigram.py

## bigram.py
#!/usr/bin/env python
import sys
if sys.version_info >= (3,):
	xrange = range
	raw_input = input

from re import sub
from numpy import log10

def trim(string):
    """Normalise text to the 27 symbols the classifier works with.

    The input is upper-cased and every run of characters outside ``A``-``Z``
    (digits, punctuation, whitespace, accented letters, ...) is collapsed
    into a single space. The result is wrapped in exactly one leading and
    one trailing space, so the first and last letters also take part in a
    bigram with a space. Input without any ASCII letter yields two spaces.
    """
    collapsed = sub('[^A-Z]+', ' ', string.upper())
    return ' %s ' % collapsed.strip()

def tokenize(filename):
    """Read a training corpus and count its character bigrams.

    The file's contents are normalised with ``trim`` and every adjacent
    pair of characters in the normalised text is counted.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(bigrams, text)`` tuple. ``bigrams`` is a dict mapping each
        two-character string that occurs in the text to the number of
        positions at which it occurs; ``text`` is the normalised corpus.
    """
    # Close the file deterministically instead of leaving it to the
    # garbage collector.
    with open(filename) as corpus:
        text = trim(corpus.read())

    # Slide a two-character window over the text and count positions.
    # str.count() only counts non-overlapping matches, so it would report
    # "AA" once in "AAA"; counting positions reports it twice and makes the
    # counts add up to len(text) - 1, the total used by bigram_prob.
    bigrams = {}
    for first, second in zip(text, text[1:]):
        bigram = first + second
        bigrams[bigram] = bigrams.get(bigram, 0) + 1
    return bigrams, text

# Add-delta smoothed probability of a bigram:
#   P(bigram) = (freq(bigram) + DELTA) / (N + DELTA * B)
def bigram_prob(bigram, text, dictionary):
    """Return the smoothed probability of ``bigram`` for one language.

    Args:
        bigram: Two-character string to score.
        text: Training text of the language; only its length is used.
        dictionary: Mapping from bigram to its count in ``text``. Bigrams
            missing from the mapping are treated as having count 0.

    Returns:
        ``(count + DELTA) / (N + DELTA * B)`` as a float, with DELTA = 0.5,
        B = 27*27 - 1 and N = ``len(text) - 1``.
    """
    delta = 0.5
    # NOTE(review): presumably the 27-symbol alphabet squared, minus the
    # double-space bigram that normalised text cannot contain -- confirm.
    bins = 27*27 - 1
    # N is the number of bigram positions in the training text.
    total = len(text) - 1
    count = dictionary.get(bigram, 0)
    return (count + delta) / (total + delta*bins)

def guess(sen, dictionaries, texts):
    """Print a bigram-by-bigram language guess for a sentence.

    The sentence is normalised with ``trim``. For every adjacent character
    pair, the log10 of each language's smoothed bigram probability is added
    to that language's running score and the intermediate values are
    printed. Finally the language with the highest score is printed (the
    first one in ``langs`` order if scores tie).

    Args:
        sen: Sentence to classify, as typed by the user.
        dictionaries: Per-language bigram-count dicts, indexed in the same
            order as the module-level ``langs`` list.
        texts: Per-language normalised training texts, in the same order.

    Returns:
        None. All results are written to standard output.
    """
    sentence = trim(sen)
    probs = [0.0] * len(texts)
    print("The sentence is: '%s'" % sen)
    for first, second in zip(sentence, sentence[1:]):
        bigram = first + second
        print("BIGRAM: '%s'" % bigram)
        for j, lang in enumerate(langs):
            prob = bigram_prob(bigram, texts[j], dictionaries[j])
            # Sum logs rather than multiplying raw probabilities, which
            # would quickly underflow for longer sentences.
            probs[j] += log10(prob)
            print("%-8s P('%s', '%s') = %e ==> log prob of sequence so far: %f"
                    % (lang.upper() + ":", first, second, prob, probs[j]))
        # print("") rather than print(): without the print_function import
        # Python 2 would print the empty tuple "()" for a bare print().
        print("")
    print("The sentence '%s' is %s." % (sen, langs[probs.index(max(probs))].title()))
    print("")


# Languages the classifier chooses between. Each one is trained from a
# "<language>.txt" file opened relative to the current working directory.
langs = ["french", "english", "italian"]
# Not read by any code visible in this file.
probs = []
# dictionaries[i] and texts[i] hold the bigram counts and the normalised
# training text for langs[i].
dictionaries, texts = [], []
for lang in langs:
    d, text = tokenize('%s.txt' % lang)
    dictionaries += [d]
    texts += [text]

# Interactive loop: classify every non-empty line the user enters.
while True:
    print("Enter a sentence to identify. (Press Ctrl+C to exit)")
    try:
        sentence = raw_input()
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C or end of input: emit a newline and leave the loop.
        # Any other exception is a real error and is allowed to propagate
        # instead of being silently swallowed by a bare except.
        # print("") rather than print(): a bare print() shows "()" on
        # Python 2 without the print_function import.
        print("")
        break
    if not sentence:
        # Nothing was typed; prompt again.
        continue
    guess(sentence, dictionaries, texts)
	# After each guess, control returns to the prompt at the top of the loop.