# zbriscoe/bigram.py

## bigram.py
import re

# Bigram frequency tables for the three training languages.
#
# Every corpus is normalised the same way: lower-cased, each run of
# characters outside a-z collapsed into a single space, surrounding spaces
# stripped, and exactly one trailing space appended.


def _load_text(path):
    """Return the normalised training text read from the file at *path*."""
    # 'with' closes the file handle as soon as the text has been read.
    with open(path) as handle:
        raw = handle.read()
    return re.sub('[^a-z]+', ' ', raw.lower()).strip() + ' '


def _count_bigrams(text):
    """Return a dict mapping each bigram of *text* to its occurrence count.

    A bigram is any two adjacent characters.  Every position is counted,
    including overlapping ones: 'aaa' contains 'aa' twice.  str.count()
    only reports non-overlapping matches and would undercount such runs,
    so the counts are accumulated in one pass over the text instead.  For
    a non-empty text the counts sum to len(text) - 1.
    """
    counts = {}
    previous = None
    for current in text:
        if previous is not None:
            pair = previous + current
            counts[pair] = counts.get(pair, 0) + 1
        previous = current
    return counts


#italian
ital_text = _load_text('italian.txt')
ital = _count_bigrams(ital_text)

#french
fren_text = _load_text('french.txt')
fren = _count_bigrams(fren_text)

#english
eng_text = _load_text('english.txt')
eng = _count_bigrams(eng_text)


#function for probability of bigram
# = (freq(bigram) + DELTA) / (N + DELTA*B)
def bigram_prob( bigram, text, dictionary ):
    """Return the add-DELTA smoothed probability of *bigram*.

    Computes (freq + DELTA) / (N + DELTA * B), where freq is the count
    stored for *bigram* in *dictionary* (0 when the bigram is absent).

    bigram     -- the two-character string to score.
    text       -- the training text; only its length is used.
    dictionary -- mapping from bigram to its count in the training text.

    Because DELTA is positive, a bigram missing from *dictionary* still
    receives a small non-zero probability.
    """
    # Smoothing constant added to every count.
    DELTA = 0.5

    # 27*27 - 1 = 728.  Presumably the number of bigrams that can occur
    # over a-z plus space, excluding the double space -- TODO confirm.
    B = 27*27 - 1

    # Unseen bigrams have a frequency of zero.
    frequency = dictionary.get(bigram, 0)

    #N is the number of bigram instances in the training set
    n = len(text) - 1

    # DELTA is a float, so this is true division on Python 2 as well.
    return (frequency + DELTA) / (n + DELTA*B)