Skip to content

Instantly share code, notes, and snippets.

@mbal
Last active December 10, 2015 09:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mbal/4418394 to your computer and use it in GitHub Desktop.
Save mbal/4418394 to your computer and use it in GitHub Desktop.
Detect the language of a text with naive Bayes over character trigrams, using the langid corpus (number 28 in the NLTK data index at http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml)
import collections
import operator
import string
from math import log
def parse_file(name):
    """Load a smoothed trigram probability model from a count file.

    Each useful line of the file holds exactly two whitespace-separated
    fields: an integer count followed by a trigram. Lines with any other
    number of fields (blank lines included) are skipped.

    Args:
        name: path of the trigram count file.

    Returns:
        dict mapping each trigram to ``(count + 1) / (total + 3000)``,
        where ``total`` is the sum of all counts that were read.
    """
    counts = {}
    total = 0
    # `with` guarantees the file is closed, even if a count fails to parse.
    with open(name) as f:
        for i, line in enumerate(f):
            # NOTE(review): this reads lines 0..3000 inclusive (3001 lines)
            # while the smoothing constant below is 3000 -- confirm intent.
            if i > 3000:
                break
            fields = line.strip().split()
            # Validate the field count *before* indexing, so short or blank
            # lines are skipped instead of raising IndexError.
            if len(fields) != 2:
                continue
            # This dataset uses < and > to represent spaces, so we
            # substitute them back.
            trigram = fields[1].replace('<', ' ').replace('>', ' ')
            count = int(fields[0])
            counts[trigram] = count
            total += count
    # Add-one smoothing over the 3000 retained trigrams.
    return {k: float(v + 1) / (total + 3000) for k, v in counts.items()}
def generate_trigrams(testo):
    """Yield each window of three consecutive characters of ``testo``.

    Windows are produced left to right and overlap by two characters.
    Inputs shorter than three characters yield nothing.
    """
    for start in range(len(testo) - 2):
        yield ''.join(testo[start:start + 3])
def get_trigram_count(text):
    """Return the relative frequency of every character trigram in ``text``.

    Characters other than ASCII letters and the space character are removed
    first, then overlapping trigrams are counted with generate_trigrams and
    each count is divided by the total number of trigram occurrences.

    Args:
        text: the string to profile.

    Returns:
        collections.defaultdict mapping trigram -> fraction of all trigram
        occurrences; unseen trigrams default to 0. The mapping is empty
        when fewer than three characters survive the filtering.
    """
    # string.ascii_letters exists on both Python 2 and 3 (string.letters is
    # Python 2 only and locale-dependent).
    allowed = set(string.ascii_letters + ' ')
    # Delete non-alphabetic chars. join() always gives back a string,
    # whereas filter() on a str returns an iterator on Python 3.
    cleaned = ''.join(ch for ch in text if ch in allowed)
    d = collections.defaultdict(int)
    # Build trigrams from the *filtered* text; the original filtered the
    # text but then counted trigrams of the unfiltered input.
    for trigram in generate_trigrams(cleaned):
        d[trigram] += 1
    total = float(sum(d.values()))
    # Only values are rewritten, so iterating the keys here is safe; with
    # no trigrams the loop body never runs and no division by zero occurs.
    for k in d:
        d[k] = d[k] / total
    return d
def bayes(text, langs):
    """Pick the language whose trigram model gives ``text`` the best score.

    For every code in ``langs`` the model file "langid/<code>-3grams.txt"
    is loaded with parse_file, and the log-probabilities of each distinct
    trigram of ``text`` are summed. Every distinct trigram contributes
    once, regardless of how often it occurs in ``text``.

    Args:
        text: the text to classify.
        langs: non-empty sequence of language codes for which a model file
            exists.

    Returns:
        ``(lang, score)`` tuple: the best-scoring language code and its
        summed log-probability. On ties the earliest code in ``langs``
        wins.

    Raises:
        IndexError: if ``langs`` is empty.
    """
    # The trigram profile depends only on `text`, so build and order it
    # once instead of once per language.
    counts = get_trigram_count(text)
    # NOTE(review): the original named this list "highest3000" but never
    # truncated it; all trigrams are still scored here -- confirm intent.
    trigrams = sorted(counts, key=counts.get, reverse=True)
    # If the trigram doesn't exist in the model, we give it a fixed prob.
    unseen = log(1.0 / 3000)

    probs = {}
    for lang in langs:
        model = parse_file("langid/%s-3grams.txt" % lang)
        # Assign rather than accumulate, so a code listed twice in `langs`
        # is not scored twice.
        probs[lang] = sum(
            log(model[t]) if t in model else unseen for t in trigrams
        )

    best = langs[0]
    for lang in langs:
        if probs[lang] > probs[best]:
            best = lang
    return best, probs[best]
# Classify the contents of file.txt among the four candidate language codes.
# Reading inside `with` closes the file handle as soon as the text is loaded.
with open('file.txt', 'r') as source:
    text = source.read()
print(bayes(text, ['it', 'en', 'fr', 'nl']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment