Create a gist now

Instantly share code, notes, and snippets.

@mbal /
Last active Dec 10, 2015

What would you like to do?
Detect language with Bayes and langid corpus (number 28 at
import collections
import operator
import string
from math import log
def parse_file(name):
    """Load a smoothed trigram model from a langid-style corpus file.

    Every useful line holds two whitespace-separated fields: an integer
    count followed by a trigram in which '<' and '>' stand in for spaces.
    Only the lines with index 0 through 3000 are read.  Lines that do not
    split into exactly two fields (blank lines, for instance) are skipped.

    Args:
        name: Path of the corpus file to read.

    Returns:
        A dict mapping each trigram to ``(count + 1) / (total + 3000)``,
        where ``total`` is the sum of all accepted counts.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If the count field of a two-field line is not an integer.
    """
    counts = {}
    total = 0
    # The context manager guarantees the handle is closed, even on error.
    with open(name) as corpus:
        for index, line in enumerate(corpus):
            if index > 3000:
                break
            fields = line.strip().split()
            # Validate the shape before indexing so malformed lines are
            # ignored instead of raising IndexError.
            if len(fields) != 2:
                continue
            count = int(fields[0])
            # This dataset uses < and > to represent spaces, so we
            # substitute them back.
            trigram = fields[1].replace('<', ' ').replace('>', ' ')
            counts[trigram] = count
            total += count
    # Add-one smoothing against a fixed vocabulary size of 3000.
    return {
        trigram: float(count + 1) / (total + 3000)
        for trigram, count in counts.items()
    }
def generate_trigrams(testo):
    """Yield every run of three consecutive characters of *testo*, in order.

    Args:
        testo: The text to scan (a string, or a sequence of strings).

    Yields:
        Each overlapping trigram as a string.  Nothing is yielded when
        *testo* has fewer than three items.
    """
    # range() is empty for a negative bound, so short inputs yield nothing.
    for start in range(len(testo) - 2):
        yield ''.join(testo[start:start + 3])
def get_trigram_count(text):
    """Return the relative frequency of every trigram found in *text*.

    Characters other than ASCII letters and the space are removed before
    the trigrams are extracted.

    Args:
        text: The text to profile.

    Returns:
        A ``collections.defaultdict`` mapping each trigram to its share of
        all trigram occurrences (the values sum to 1.0).  It is empty when
        the cleaned text is shorter than three characters.
    """
    # Delete non-alphabetic chars.  The cleaned string is what must be fed
    # to the trigram generator; previously it was computed and discarded.
    allowed = set(string.ascii_letters + ' ')
    cleaned = ''.join(ch for ch in text if ch in allowed)

    counts = collections.defaultdict(int)
    for trigram in generate_trigrams(cleaned):
        counts[trigram] += 1

    # With no trigrams the loop below does not run, so there is no
    # division by zero.
    total = float(sum(counts.values()))
    for trigram in counts:
        counts[trigram] /= total
    return counts
def bayes(text, langs):
    """Pick the language in *langs* whose trigram model best fits *text*.

    For each language the model is loaded from ``langid/<lang>-3grams.txt``
    and a log-score is accumulated over the distinct trigrams of *text*:
    ``log(model[trigram])`` when the model knows the trigram, and
    ``log(1.0 / 3000)`` otherwise.  Each distinct trigram contributes once;
    its frequency in *text* is not used as a weight.

    Args:
        text: The text to classify.
        langs: Non-empty sequence of language codes.

    Returns:
        A ``(lang, score)`` tuple holding the best-scoring language code and
        its log-score.  On ties, the earliest code in *langs* wins.

    Raises:
        IndexError: If *langs* is empty.
        IOError/OSError: If a model file cannot be opened.
    """
    if not langs:
        # Same exception type the ``langs[0]`` lookup used to produce.
        raise IndexError("langs must contain at least one language code")

    # The text profile does not depend on the language, so build it once.
    # All observed trigrams are scored; no cut-off is applied.
    trigrams = sorted(get_trigram_count(text).items(),
                      key=operator.itemgetter(1), reverse=True)
    # If the trigram doesn't exist in a model, we give it a fixed prob.
    unseen = log(1.0 / 3000)

    probs = {}
    for lang in langs:
        model = parse_file("langid/%s-3grams.txt" % lang)
        score = 0
        for trigram, _freq in trigrams:
            if trigram in model:
                score += log(model[trigram])
            else:
                score += unseen
        # Assigning (not accumulating) keeps a repeated code from being
        # counted twice.
        probs[lang] = score

    # max() returns the first maximal item, so earlier codes win ties.
    best = max(langs, key=probs.__getitem__)
    return best, probs[best]
# Classify the contents of file.txt among Italian, English, French and Dutch.
# The context manager closes the input file as soon as it has been read.
with open('file.txt', 'r') as source:
    text = source.read()
print(bayes(text, ['it', 'en', 'fr', 'nl']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment