# mbal/bayesdetect.py Last active Dec 10, 2015

# Detect the language of a text with a naive Bayes classifier over character trigrams, using the langid corpus (number 28 at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml).
 import collections import operator import string from math import log def parse_file(name): f = open(name) x = {} c = 0 for i, line in enumerate(f): if i > 3000: break line = line.strip().split() #this dataset uses < and > to represent spaces, so we sostitute it back line[1] = line[1].replace('<', ' ').replace('>', ' ') if len(line) == 2: x[line[1]] = int(line[0]) c = c + int(line[0]) for k in x: x[k] = float(x[k]+1)/(c+3000) return x def generate_trigrams(testo): for i, c in enumerate(testo[:-2]): yield ''.join((c, testo[i+1], testo[i+2])) def get_trigram_count(text): d = collections.defaultdict(lambda: 0) #delete non alphabetic chars testo = filter(lambda x: x in string.letters + ' ', text) for i in generate_trigrams(text): d[i] += 1 total = sum(d.values()) for k in d: d[k] = d[k]/float(total) return d def bayes(text, langs): probs = {x: 0 for x in langs} for lang in langs: model = parse_file("langid/%s-3grams.txt"%lang) highest3000 = sorted(get_trigram_count(text).iteritems(), key=operator.itemgetter(1), reverse=True) for k, v in highest3000: try: probs[lang] += log(model[k]) except KeyError: #if the trigram doesn't exists, we give it a fixed prob. probs[lang] += log(1.0/3000) maxl, lang = probs[langs[0]], langs[0] for l in probs: if probs[l] > maxl: maxl = probs[l] lang = l return lang, maxl text = open('file.txt', 'r').read() print(bayes(text, ['it', 'en', 'fr', 'nl']))