Skip to content

Instantly share code, notes, and snippets.

@bwrsandman
Forked from zbriscoe/bigram.py
Created December 2, 2012 07:44
Show Gist options
  • Save bwrsandman/4187628 to your computer and use it in GitHub Desktop.
Save bwrsandman/4187628 to your computer and use it in GitHub Desktop.
Bigram
#!/usr/bin/env python
import sys
from collections import Counter
from re import sub

from numpy import log10

if sys.version_info >= (3,):
    # Python 3 renamed these builtins; alias the old names so the rest of
    # the file runs unchanged on both major versions.
    xrange = range
    raw_input = input
def trim(string):
    """Normalise *string* to space-separated runs of the letters A-Z.

    The text is upper-cased, every run of characters outside ``A-Z``
    (digits, punctuation, whitespace, accented letters, ...) is collapsed
    into a single space, surrounding spaces are stripped, and the result is
    wrapped in exactly one leading and one trailing space.

    An input containing no ASCII letters therefore yields ``'  '`` (two
    spaces).
    """
    letters_only = sub(r'[^A-Z]+', ' ', string.upper())
    return ' ' + letters_only.strip() + ' '
def tokenize(filename):
    """Read *filename* and count every character bigram of its trimmed text.

    Returns a ``(bigrams, text)`` tuple: ``bigrams`` is a dict mapping each
    two-character substring of the trimmed text to the number of positions
    at which it occurs, and ``text`` is the trimmed text itself.
    """
    # Context manager so the file handle is closed deterministically.
    with open(filename) as handle:
        text = trim(handle.read())
    # Count with a sliding window in a single pass. str.count() only counts
    # non-overlapping matches, so it under-counts doubled letters ('AA' in
    # 'AAA' is 1 instead of 2), and calling it once per distinct bigram
    # rescans the whole text every time.
    bigrams = Counter(text[i:i + 2] for i in xrange(len(text) - 1))
    return dict(bigrams), text
# Add-delta smoothed probability of a bigram:
#   P(bigram) = (freq(bigram) + DELTA) / (N + DELTA * B)
def bigram_prob(bigram, text, dictionary):
    """Return the smoothed probability of *bigram* under one language model.

    bigram     -- the two-character string to score.
    text       -- the training text; only its length is used.
    dictionary -- mapping from bigram to its count in the training text.

    A bigram missing from *dictionary* is treated as having count 0, so the
    result is always strictly positive.
    """
    DELTA = 0.5
    # NOTE(review): presumably 27 symbols (A-Z plus space) squared, minus
    # one pair that cannot occur -- confirm the intent of the "- 1".
    B = 27 * 27 - 1
    frequency = dictionary.get(bigram, 0)
    # N is the number of bigram instances in the training set.
    n = len(text) - 1
    return (frequency + DELTA) / (n + DELTA * B)
def guess(sen, dictionaries, texts):
    """Print a bigram-by-bigram trace and the most likely language of *sen*.

    sen          -- the raw sentence to classify.
    dictionaries -- one bigram-count mapping per language.
    texts        -- one training text per language.

    Both lists are matched by position with the module-level ``langs``
    list. For every bigram of the trimmed sentence, each language's
    log10 probability is accumulated and printed; the language with the
    highest total is reported at the end. Nothing is returned.
    """
    sentence = trim(sen)
    probs = [0.0] * len(texts)
    print("The sentence is: '%s'" % sen)
    for i in xrange(len(sentence) - 1):
        bigram = sentence[i:i + 2]
        print("BIGRAM: '%s'" % bigram)
        for j, (lang, text, dictionary) in enumerate(
                zip(langs, texts, dictionaries)):
            prob = bigram_prob(bigram, text, dictionary)
            # Sum logs instead of multiplying raw probabilities.
            probs[j] += log10(prob)
            print("%-8s P('%s', '%s') = %e ==> log prob of sequence so far: %f"
                  % (lang.upper() + ":", bigram[0], bigram[1], prob, probs[j]))
        # print("") rather than print(): without the print_function import,
        # Python 2 would output "()" for a bare print().
        print("")
    best = probs.index(max(probs))
    print("The sentence '%s' is %s." % (sen, langs[best].title()))
    print("")
# Languages to tell apart; training text is read from '<language>.txt'.
langs = ["french", "english", "italian"]
dictionaries = []
texts = []
probs = []  # not read by any code visible here; kept so the module's names are unchanged
for lang in langs:
    d, text = tokenize('%s.txt' % lang)
    dictionaries.append(d)
    texts.append(text)

# Prompt loop: classify one sentence per line until the user interrupts.
while True:
    print("Enter a sentence to identify. (Press Ctrl+C to exit)")
    try:
        sentence = raw_input()
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C or end of input: leave quietly. Any other exception is a
        # real error and is allowed to propagate instead of being hidden.
        print("")
        break
    if not sentence:
        continue
    guess(sentence, dictionaries, texts)
@bwrsandman
Copy link
Author

Feel free to change the sentences.

You can run it like this:

$ ./bigram.py

The output is long, so you might want to redirect it to a file:

$ ./bigram.py > output.txt

Alternatively, you can add an argument parser (e.g. `argparse`) so that the verbose per-bigram output is printed only when `-v` is passed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment