Skip to content
Create a gist now

Instantly share code, notes, and snippets.

@mbal /
Last active

Embed URL


Subversion checkout URL

You can clone with
Download ZIP
Detect language with Bayes and langid corpus (number 28 at
import collections
import operator
import string
from math import log
def parse_file(name):
    """Load a trigram count file and convert it to smoothed probabilities.

    Each usable line holds two whitespace-separated fields: an integer count
    followed by a trigram in which ``<`` and ``>`` stand for spaces. Lines
    that do not split into exactly two fields are skipped. Reading stops
    after the line with index 3000.

    Args:
        name: Path of the trigram count file.

    Returns:
        A dict mapping each trigram to ``(count + 1) / (total + 3000)``,
        where ``total`` is the sum of every count that was read.

    Raises:
        ValueError: If the first field of a two-field line is not an integer.
    """
    counts = {}
    total = 0
    # The context manager closes the file even if a line fails to parse.
    with open(name) as handle:
        for index, raw_line in enumerate(handle):
            if index > 3000:
                break
            fields = raw_line.strip().split()
            # Validate the shape before indexing so blank or short lines
            # are ignored instead of raising IndexError.
            if len(fields) != 2:
                continue
            # This dataset uses < and > to represent spaces, so we
            # substitute them back.
            trigram = fields[1].replace('<', ' ').replace('>', ' ')
            count = int(fields[0])
            counts[trigram] = count
            total += count
    # Adding 1 to each count and 3000 to the denominator keeps every
    # probability strictly positive, so callers can take its logarithm.
    denominator = total + 3000
    return {
        trigram: float(count + 1) / denominator
        for trigram, count in counts.items()
    }
def generate_trigrams(testo):
    """Yield every run of three consecutive characters in ``testo``.

    Args:
        testo: The text to scan. Inputs shorter than three characters
            yield nothing.

    Yields:
        Each three-character string, in order of its starting position.
    """
    for start in range(len(testo) - 2):
        # Joining the slice keeps the result a plain string even when
        # ``testo`` is a list of single characters.
        yield ''.join(testo[start:start + 3])
def get_trigram_count(text):
    """Compute the relative frequency of every character trigram in ``text``.

    Characters other than ASCII letters and spaces are removed before the
    trigrams are extracted.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.defaultdict`` (default value 0) mapping each trigram
        to its share of all trigrams found. It is empty when fewer than
        three characters remain after filtering.
    """
    allowed = set(string.ascii_letters + ' ')
    # Delete non-alphabetic chars. The cleaned string, not the raw input,
    # must be the one handed to the trigram generator.
    cleaned = ''.join(ch for ch in text if ch in allowed)
    frequencies = collections.defaultdict(int)
    for trigram in generate_trigrams(cleaned):
        frequencies[trigram] += 1
    # When no trigram was found the loop below does not run, so the zero
    # total is never used as a divisor.
    total = float(sum(frequencies.values()))
    for trigram in frequencies:
        frequencies[trigram] /= total
    return frequencies
def bayes(text, langs):
    """Pick the language whose trigram model gives ``text`` the best score.

    For every code in ``langs`` the model is loaded from
    ``langid/<code>-3grams.txt``. The score of a language is the sum, over
    the distinct trigrams of ``text``, of the log of the model probability
    of that trigram.

    Args:
        text: The text to classify.
        langs: Non-empty sequence of language codes.

    Returns:
        A ``(language, score)`` tuple for the highest-scoring language. On a
        tie the code that appears first in ``langs`` wins.

    Raises:
        IndexError: If ``langs`` is empty.
    """
    # The trigram profile depends only on the text, so build and sort it
    # once instead of once per language. The list is not truncated: every
    # distinct trigram of the text contributes to the score.
    ranked = sorted(get_trigram_count(text).items(),
                    key=operator.itemgetter(1), reverse=True)
    # If the trigram doesn't exist in a model, we give it a fixed prob.
    unseen = 1.0 / 3000
    probs = {}
    for lang in langs:
        model = parse_file("langid/%s-3grams.txt" % lang)
        probs[lang] = sum(log(model.get(trigram, unseen))
                          for trigram, _ in ranked)
    best_lang = langs[0]
    best_score = probs[best_lang]
    # Walk the codes in the caller's order so ties resolve deterministically.
    for lang in langs:
        if probs[lang] > best_score:
            best_lang, best_score = lang, probs[lang]
    return best_lang, best_score
# Classify the contents of file.txt against the 'it', 'en', 'fr' and 'nl'
# trigram models and print the winning (language, score) pair.
with open('file.txt', 'r') as text_file:
    text = text_file.read()
print(bayes(text, ['it', 'en', 'fr', 'nl']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.