Last active
December 10, 2015 09:59
-
-
Save mbal/4418394 to your computer and use it in GitHub Desktop.
Detect language with Bayes and langid corpus (number 28 at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import operator
import string
from math import log
def parse_file(name):
    """Load a trigram frequency file and return smoothed probabilities.

    Each useful line of the file is expected to hold two whitespace-separated
    fields: an integer count followed by a trigram. Lines that do not split
    into exactly two fields (blank lines, malformed entries) are skipped.

    Args:
        name: path of the trigram file to read.

    Returns:
        A dict mapping each trigram to ``(count + 1) / (total + 3000)``,
        where ``total`` is the sum of all counts that were read.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if the first field of a two-field line is not an integer.
    """
    counts = {}
    total = 0
    # The context manager guarantees the file is closed, even on error.
    with open(name) as f:
        for i, line in enumerate(f):
            # Only lines with index 0..3000 are taken into account.
            if i > 3000:
                break
            fields = line.split()
            # Validate the shape of the line *before* indexing into it.
            if len(fields) != 2:
                continue
            # This dataset uses < and > to represent spaces, so we substitute
            # them back.
            trigram = fields[1].replace('<', ' ').replace('>', ' ')
            count = int(fields[0])
            counts[trigram] = count
            total += count
    # Add-one smoothing over the counts that were read.
    return dict(
        (trigram, float(count + 1) / (total + 3000))
        for trigram, count in counts.items()
    )
def generate_trigrams(testo):
    """Yield every run of three consecutive characters of ``testo``.

    ``testo`` (Italian for "text") is the string to scan. Trigrams are
    produced left to right and overlap; input shorter than three characters
    yields nothing.
    """
    # Zipping the text with itself shifted by one and by two positions lines
    # up the three members of each trigram; zip stops at the shortest slice.
    for triple in zip(testo, testo[1:], testo[2:]):
        yield ''.join(triple)
def get_trigram_count(text):
    """Return the relative frequency of each trigram found in ``text``.

    Characters other than ASCII letters and the space character are removed
    before the trigrams are extracted.

    Args:
        text: the string to analyse.

    Returns:
        A ``collections.defaultdict`` mapping each trigram to its share of all
        trigram occurrences (the values sum to 1). Missing keys default to 0.
        The mapping is empty when fewer than three characters survive the
        filtering.
    """
    # Delete non-alphabetic chars. The cleaned string (not the raw input) is
    # what must be fed to the trigram generator.
    allowed = set(string.ascii_letters + ' ')
    cleaned = ''.join(ch for ch in text if ch in allowed)

    counts = collections.defaultdict(int)
    for trigram in generate_trigrams(cleaned):
        counts[trigram] += 1

    # Normalise raw counts to relative frequencies. With no trigrams the loop
    # body never runs, so the zero total is never used as a divisor.
    total = float(sum(counts.values()))
    for trigram in counts:
        counts[trigram] /= total
    return counts
def bayes(text, langs):
    """Guess the language of ``text`` among the candidates in ``langs``.

    For every candidate language the model file ``langid/<lang>-3grams.txt``
    is loaded with ``parse_file`` and the log-probabilities of the distinct
    trigrams of ``text`` are summed. The language with the highest total wins.

    Args:
        text: the string to classify.
        langs: indexable sequence of language codes, e.g. ``['it', 'en']``.

    Returns:
        A ``(language, score)`` tuple, where ``score`` is the summed
        log-probability of the winning language. On ties the language that
        appears first in ``langs`` is returned.

    Raises:
        IndexError: if ``langs`` is empty.
    """
    # If the trigram doesn't exist in a model, we give it a fixed probability.
    unseen_log_prob = log(1.0 / 3000)

    # The trigram profile of the text does not depend on the language, so it
    # is computed once instead of once per candidate. Only the distinct
    # trigrams matter for the score; their frequencies are not used.
    trigrams = list(get_trigram_count(text))

    probs = {}
    for lang in langs:
        model = parse_file("langid/%s-3grams.txt" % lang)
        probs[lang] = sum(
            log(model[trigram]) if trigram in model else unseen_log_prob
            for trigram in trigrams
        )

    # Start from the first candidate and replace it only on a strictly better
    # score, so earlier entries of ``langs`` win ties.
    best = langs[0]
    for lang in langs[1:]:
        if probs[lang] > probs[best]:
            best = lang
    return best, probs[best]
# Classify the contents of file.txt among Italian, English, French and Dutch
# and print the (language, score) result. The context manager closes the
# input file as soon as it has been read.
with open('file.txt', 'r') as f:
    text = f.read()
print(bayes(text, ['it', 'en', 'fr', 'nl']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment