-
-
Save bwrsandman/4187628 to your computer and use it in GitHub Desktop.
Bigram
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
if sys.version_info >= (3,): | |
xrange = range | |
raw_input = input | |
from re import sub | |
from numpy import log10 | |
def trim(string):
    """Normalize *string* for bigram extraction.

    The text is upper-cased, every run of characters outside ``A-Z`` is
    collapsed into a single space, surrounding spaces are stripped, and the
    result is wrapped in exactly one leading and one trailing space.
    """
    letters_only = sub('[^A-Z]+', ' ', string.upper())
    core = letters_only.strip()
    return ' ' + core + ' '
def tokenize(filename):
    """Read *filename* and count the character bigrams of its trimmed text.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(bigrams, text)`` tuple. ``bigrams`` is a dict mapping each
        two-character string found in the text to the number of positions
        at which it occurs; ``text`` is the file content after ``trim()``.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as handle:
        text = trim(handle.read())
    bigrams = {}
    # Slide a two-character window over the text.  str.count() only counts
    # non-overlapping matches, so it under-counts runs such as "AAA"
    # (two "AA" bigrams, but count() reports one).
    for first, second in zip(text, text[1:]):
        bigram = first + second
        bigrams[bigram] = bigrams.get(bigram, 0) + 1
    return bigrams, text
def bigram_prob(bigram, text, dictionary):
    """Return the add-delta smoothed probability of *bigram*.

    The value computed is::

        (freq(bigram) + DELTA) / (N + DELTA * B)

    where ``freq(bigram)`` is the count stored in *dictionary* (0 when the
    bigram is absent) and ``N`` is ``len(text) - 1``, i.e. the number of
    bigram positions in the training text.
    """
    smoothing = 0.5
    # NOTE(review): presumably 27 symbols (A-Z plus space) squared, minus
    # one for a combination that cannot occur -- confirm.
    bins = 27*27 - 1
    # Number of bigram instances in the training set.
    total = len(text) - 1
    # Unseen bigrams get a frequency of zero.
    seen = dictionary.get(bigram, 0)
    return (seen + smoothing) / (total + smoothing*bins)
def guess(sen, dictionaries, texts):
    """Print a bigram-by-bigram score of *sen* for every language.

    For each bigram of the trimmed sentence, the smoothed probability under
    each language model is printed together with the running sum of log10
    probabilities.  Finally the language with the highest sum is printed.

    Args:
        sen: The raw sentence to classify.
        dictionaries: Bigram-count dicts, indexed in parallel with the
            module-level ``langs`` list.
        texts: Training texts, indexed in parallel with ``langs``.
    """
    normalized = trim(sen)
    log_scores = [0.0] * len(texts)
    print("The sentence is: '%s'" % sen)
    # Walk over every pair of adjacent characters in the normalized text.
    for left, right in zip(normalized, normalized[1:]):
        pair = left + right
        print("BIGRAM: '%s'" % pair)
        for idx, lang in enumerate(langs):
            p = bigram_prob(pair, texts[idx], dictionaries[idx])
            # Sum logs rather than multiplying the raw probabilities.
            log_scores[idx] += log10(p)
            print("%-8s P('%s', '%s') = %e ==> log prob of sequence so far: %f"
                  % (lang.upper() + ":", left, right, p, log_scores[idx]))
        print()
    winner = log_scores.index(max(log_scores))
    print("The sentence '%s' is %s." % (sen, langs[winner].title()))
    print()
# Languages to distinguish; each one is trained from "<language>.txt".
langs = ["french", "english", "italian"]
dictionaries = []
texts = []
# Not used by the code below (guess() keeps its own list); retained so the
# module's globals stay unchanged.
probs = []
for lang in langs:
    d, text = tokenize('%s.txt' % lang)
    dictionaries.append(d)
    texts.append(text)

# Interactive loop: classify one sentence per line until input ends.
while True:
    print("Enter a sentence to identify. (Press Ctrl+C to exit)")
    try:
        sentence = raw_input()
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C or end of input ends the session cleanly; any other
        # exception is a real error and is allowed to surface.
        print()
        break
    if not sentence:
        # Ignore empty lines and prompt again.
        continue
    guess(sentence, dictionaries, texts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Feel free to change the sentences.
You can run it like this:
$ ./bigram.py
The output is long, so you might want to pipe it to a file:
$ ./bigram.py > output.txt
Alternatively, you can add an argument parser (e.g. `argparse`) that prints the verbose output only when `-v` is passed.