Skip to content

Instantly share code, notes, and snippets.

@widiger-anna
Last active May 5, 2018 19:29
Show Gist options
  • Save widiger-anna/5eec18823747be19276f279468c4acb9 to your computer and use it in GitHub Desktop.
Save widiger-anna/5eec18823747be19276f279468c4acb9 to your computer and use it in GitHub Desktop.
Language Detection Test for EN and DE using trigrams
from __future__ import unicode_literals, print_function
import os
'''
Example of analyzing a string with a sliding window (trigrams, n=3):
counts all occurrences of each trigram and stores them in a dictionary,
then uses the trigram frequency statistics to identify a language.
'''
def read_file(filename):
    """Return the contents of *filename* as text, or False if it is not a file.

    The file is read in binary mode and decoded here, once, so that the
    result can be indexed character by character and concatenated with the
    unicode string literals this module uses (``unicode_literals`` is on).

    :param filename: path of the file to read.
    :returns: the decoded text, or ``False`` when *filename* does not name
        an existing regular file (directories also yield ``False``).
    """
    if not os.path.isfile(filename):
        # A missing file is reported with False rather than an exception;
        # that contract is kept unchanged.
        return False
    # The context manager closes the handle even if read() raises.
    with open(filename, 'rb') as fh:
        # NOTE(review): assumes the corpora are UTF-8 encoded - confirm.
        # Undecodable bytes become U+FFFD instead of aborting the run.
        return fh.read().decode('utf-8', 'replace')
def trigrams(text):
    """Count the lower-cased trigrams of *text* using a sliding window.

    A window starts at every position that does not hold a space character.
    It is padded with spaces on the right when the text ends, or when its
    second character is a space, so every key is built from exactly three
    characters before lower-casing. Only ``" "`` is treated as a separator;
    any other character (including newlines) is part of a trigram.

    :param text: the string to analyse.
    :returns: dict mapping each trigram to its number of occurrences.
    """
    out = {}
    length = len(text)
    for i in range(length):
        first = text[i]
        if first == " ":
            # No trigram starts on a space.
            continue
        second = text[i + 1] if i + 1 < length else " "
        # The third letter is only taken when the word continues past the
        # second letter and the text has not run out.
        if second != " " and i + 2 < length:
            third = text[i + 2]
        else:
            third = " "
        key = (first + second + third).lower()
        # dict.get replaces dict.has_key, which no longer exists in Python 3.
        out[key] = out.get(key, 0) + 1
    return out
def freq(text):
    """Return the relative frequency of every trigram in *text*.

    :param text: the string to analyse; it is passed to ``trigrams()``.
    :returns: dict mapping each trigram to its share of all trigram
        occurrences in *text*. The shares sum to 1 for non-empty input;
        an empty dict is returned when *text* yields no trigrams.
    """
    counts = trigrams(text)
    # Normalise by the total number of occurrences, not by the number of
    # distinct trigrams: only then are the values real frequencies, and only
    # then are models built from corpora of different sizes comparable.
    # float() keeps the division true division on Python 2 as well.
    total = float(sum(counts.values()))
    for key in counts:
        counts[key] = counts[key] / total
    return counts
def lang(languages, sample):
    """Score every language model against the trigrams of *sample*.

    A language's score is the sum of that model's values for each trigram
    that occurs in *sample*. Only the keys of *sample* are used; its own
    values do not influence the score.

    :param languages: dict mapping a language key to its trigram model
        (a dict mapping trigram to a number).
    :param sample: dict whose keys are the trigrams of the text to classify.
    :returns: dict mapping each language key to its score (a float, 0.0
        when no trigram of the sample is known to the model).
    """
    stats = {}
    for name, model in languages.items():
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        # Starting the sum at 0.0 keeps every score a float.
        stats[name] = sum(
            (model[trigram] for trigram in sample if trigram in model), 0.0
        )
    return stats
def which_lang(stats):
    """Return the key of *stats* that has the highest score.

    :param stats: dict mapping a language key to its score.
    :returns: the best-scoring key, or ``None`` when *stats* is empty. When
        several keys share the highest score, the first one in the dict's
        iteration order is returned.
    """
    if not stats:
        # max() raises on an empty iterable; an empty input yields None.
        return None
    return max(stats, key=stats.get)
# Build one trigram model per language from the reference corpora, which are
# looked up relative to the current working directory.
en_text = read_file("en.txt")
de_text = read_file("de.txt")
if en_text is False or de_text is False:
    # read_file() signals a missing file with False; stop here with a clear
    # message instead of failing later inside freq().
    raise IOError("en.txt and de.txt must both exist in the working directory")
en = freq(en_text)
de = freq(de_text)

# Score a short sample against both models and print the key of the
# best-matching language.
sample = freq("anna widiger")
stats = lang({"en": en, "de": de}, sample)
print(which_lang(stats))
# Dump the German model for manual inspection, one "trigram value" pair per
# line, ordered from the smallest value to the largest.
test = [[k, de[k]] for k in de]
# A key function replaces the former comparison function: list.sort() no
# longer accepts a comparator on Python 3, and the old one never returned 0,
# so equal values were ordered inconsistently.
test.sort(key=lambda pair: pair[1])
for t in test:
    print(t[0], t[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment