Skip to content

Instantly share code, notes, and snippets.

@zbriscoe
Created December 2, 2012 07:14
Show Gist options
  • Save zbriscoe/4187559 to your computer and use it in GitHub Desktop.
Save zbriscoe/4187559 to your computer and use it in GitHub Desktop.
Bigram
import re
def _load_bigram_counts(path):
    """Read the text file at *path* and count its character bigrams.

    The file contents are lower-cased, every run of characters outside
    ``a-z`` (digits, punctuation, whitespace, accented letters, ...) is
    collapsed into a single space, surrounding spaces are stripped and a
    single trailing space is appended.

    Returns a ``(text, counts)`` tuple: ``text`` is the normalized string
    and ``counts`` maps every two-character substring of ``text`` to the
    number of times it occurs, overlapping occurrences included.

    Raises whatever ``open`` raises when *path* cannot be read.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as handle:
        raw = handle.read()
    text = re.sub('[^a-z]+', ' ', raw.lower()).strip() + ' '

    counts = {}
    # One pass over adjacent character pairs.  This counts overlapping
    # occurrences (str.count does not: "aaa".count("aa") == 1), so the
    # counts always sum to len(text) - 1, the N used by bigram_prob.
    for first, second in zip(text, text[1:]):
        bigram = first + second
        counts[bigram] = counts.get(bigram, 0) + 1
    return text, counts


#italian
ital_text, ital = _load_bigram_counts('italian.txt')
#french
fren_text, fren = _load_bigram_counts('french.txt')
#english
eng_text, eng = _load_bigram_counts('english.txt')
#function for probability of bigram
# = (freq(bigram) + DELTA) / (N + DELTA*B)
def bigram_prob(bigram, text, dictionary, delta=0.5, num_bigrams=27*27 - 1):
    """Return the additively smoothed probability of *bigram*.

    Computes ``(freq + delta) / (N + delta * num_bigrams)`` where ``freq``
    is the count stored for *bigram* in *dictionary* (0 when absent) and
    ``N`` is the number of bigram positions in *text*.

    Parameters:
        bigram: the two-character string to score.
        text: the training text; only its length is used, to derive N.
        dictionary: mapping from bigram to its count in the training text.
        delta: smoothing constant added to every count (default 0.5).
        num_bigrams: number of possible distinct bigrams, B.  The default
            is 27*27 - 1 (728), as in the original code.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: if ``delta`` is 0 and *text* has no bigrams.
    """
    # Unseen bigrams contribute a frequency of zero.
    frequency = dictionary.get(bigram, 0)
    #N is the number of bigram instances in the training set
    # (clamped so an empty text yields 0 rather than -1).
    n = max(len(text) - 1, 0)
    # float() keeps true division under Python 2 even for an integer delta.
    return float(frequency + delta) / (n + delta * num_bigrams)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment