public
Last active

Detect language with Bayes and langid corpus (number 28 at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml)

  • Download Gist
bayesdetect.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
import collections
import operator
import string
from math import log
 
def parse_file(name):
    """Load a trigram count file and return smoothed trigram probabilities.

    Each usable line holds exactly two whitespace-separated fields: an
    integer count followed by a trigram in which ``<`` and ``>`` stand for
    spaces.  Only the lines with index 0..3000 are read.  Lines that do not
    have exactly two fields (blank lines, malformed rows) are skipped.

    Args:
        name: Path of the trigram file to read.

    Returns:
        A dict mapping each trigram (with spaces restored) to
        ``(count + 1) / (total + 3000)``, where ``total`` is the sum of all
        counts that were read.

    Raises:
        ValueError: If the count field of a two-field line is not an integer.
    """
    counts = {}
    total = 0
    # The context manager closes the file even if a line fails to parse.
    with open(name) as handle:
        for index, raw_line in enumerate(handle):
            if index > 3000:
                break
            fields = raw_line.strip().split()
            # Validate the shape *before* indexing: a blank or one-field
            # line must be skipped, not crash with IndexError.
            if len(fields) != 2:
                continue
            count = int(fields[0])
            # This dataset uses < and > to represent spaces, so substitute
            # them back.
            trigram = fields[1].replace('<', ' ').replace('>', ' ')
            counts[trigram] = count
            total += count
    # Add-one smoothing over the fixed 3000 denominator offset.
    denominator = total + 3000
    return {trigram: float(count + 1) / denominator
            for trigram, count in counts.items()}
 
def generate_trigrams(testo):
    """Yield each window of three consecutive items of *testo* as one string.

    Windows overlap and advance one position at a time, so ``"hello"``
    produces ``"hel"``, ``"ell"``, ``"llo"``.  Inputs shorter than three
    items yield nothing.
    """
    # Zipping the sequence with itself shifted by one and by two positions
    # pairs every item with its two successors; zip stops at the shortest.
    for triple in zip(testo, testo[1:], testo[2:]):
        yield ''.join(triple)
 
def get_trigram_count(text):
    """Return the relative frequency of every character trigram in *text*.

    Characters other than ASCII letters and the space are removed before
    the trigrams are extracted (so accented letters, digits, punctuation
    and newlines are dropped, and their neighbours become adjacent).

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.defaultdict`` mapping each trigram to its count
        divided by the total number of trigrams.  Empty when the cleaned
        text has fewer than three characters.
    """
    # Delete non-alphabetic chars.  ``string.ascii_letters`` exists on both
    # Python 2 and 3 and, unlike ``string.letters``, is locale-independent.
    allowed = set(string.ascii_letters + ' ')
    cleaned = ''.join(ch for ch in text if ch in allowed)

    frequencies = collections.defaultdict(int)
    # Count trigrams of the *cleaned* text; the filtered string used to be
    # computed and then ignored.
    for trigram in generate_trigrams(cleaned):
        frequencies[trigram] += 1

    # With no trigrams the loop below does not run, so no division by zero.
    total = float(sum(frequencies.values()))
    for trigram in frequencies:
        frequencies[trigram] /= total
    return frequencies
 
def bayes(text, langs):
    """Pick the language whose trigram model gives *text* the highest score.

    For every language the model file ``langid/<lang>-3grams.txt`` is
    loaded with ``parse_file``.  The score of a language is the sum, over
    the distinct trigrams of *text*, of the log of the model probability of
    that trigram; trigrams missing from the model contribute
    ``log(1.0/3000)``.  Each distinct trigram contributes once, regardless
    of how often it occurs in *text*.

    Args:
        text: The string to classify.
        langs: Indexable sequence of language codes to choose from.

    Returns:
        A ``(language, score)`` tuple for the best-scoring language.  On a
        tie the language that comes first in *langs* wins.

    Raises:
        IndexError: If *langs* is empty.
    """
    # The trigrams of the text do not depend on the language, so extract
    # them once instead of once per model.  Only the keys are needed.
    trigrams = list(get_trigram_count(text))
    # If a trigram doesn't exist in the model, we give it a fixed prob.
    unseen_log_prob = log(1.0 / 3000)

    probs = {}
    for lang in langs:
        model = parse_file("langid/%s-3grams.txt" % lang)
        score = 0
        for trigram in trigrams:
            if trigram in model:
                score += log(model[trigram])
            else:
                score += unseen_log_prob
        probs[lang] = score

    # Strict ">" keeps the earliest language on ties.
    best = langs[0]
    for lang in langs[1:]:
        if probs[lang] > probs[best]:
            best = lang
    return best, probs[best]
 
# Script body: classify the contents of file.txt among Italian, English,
# French and Dutch, and print the (language, score) pair.
# The context manager closes the input file as soon as it has been read.
with open('file.txt', 'r') as source:
    text = source.read()
print(bayes(text, ['it', 'en', 'fr', 'nl']))

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.