Last active
May 5, 2018 19:29
-
-
Save widiger-anna/5eec18823747be19276f279468c4acb9 to your computer and use it in GitHub Desktop.
Language Detection Test for EN and DE using trigrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals, print_function | |
import os | |
'''
Example of analyzing a string using a sliding window (trigrams, n=3):
counts all occurrences of trigrams, stores them in a dictionary, and
uses the trigram frequency stats to identify a language.
'''
def read_file(filename):
    """Return the text content of *filename*, or ``False`` if it is not a file.

    The file is read as bytes and decoded as UTF-8 so that callers receive
    real characters (one element per letter, including umlauts) rather than
    raw bytes.  NOTE(review): UTF-8 is assumed for the corpus files; confirm
    against the actual ``en.txt`` / ``de.txt`` encoding.

    Raises:
        UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    if not os.path.isfile(filename):
        return False
    # The context manager closes the handle even when read() raises.
    with open(filename, 'rb') as fh:
        return fh.read().decode('utf-8')
def trigrams(text):
    """Count the trigrams produced by sliding a 3-character window over *text*.

    A window starts at every position whose character is not a space.  The
    window is padded with spaces when it runs past the end of the text, and
    the third character is forced to a space when the second one already is
    a space (so a trigram never reaches into the next word).  Trigrams are
    lower-cased before counting.

    Returns:
        dict mapping each trigram string to its number of occurrences.
    """
    counts = {}
    size = len(text)
    for start in range(size):
        first = text[start]
        if first == " ":
            # Windows never begin on a space.
            continue
        second = text[start + 1] if start + 1 < size else " "
        if start + 2 < size and second != " ":
            third = text[start + 2]
        else:
            third = " "
        gram = (first + second + third).lower()
        # dict.get replaces dict.has_key, which no longer exists in Python 3.
        counts[gram] = counts.get(gram, 0) + 1
    return counts
def freq(text):
    """Return the relative frequency of every trigram in *text*.

    Each trigram count is divided by the total number of trigram occurrences,
    so the values sum to 1 and profiles built from corpora of different sizes
    are comparable.  (Dividing by the number of *distinct* trigrams instead
    would let scores grow with corpus size.)

    Returns:
        dict mapping trigram -> frequency; empty for text without trigrams.
    """
    counts = trigrams(text)
    # float() keeps true division on Python 2 as well.
    total = float(sum(counts.values()))
    # With no trigrams the comprehension is empty, so total == 0 is never
    # used as a divisor.
    return {gram: count / total for gram, count in counts.items()}
def lang(languages, sample):
    """Score *sample* against every language profile.

    Args:
        languages: dict mapping a language name to its trigram -> frequency
            profile.
        sample: iterable of trigrams (e.g. the keys of a profile dict) to
            look up; only the trigrams themselves are used, not any values
            attached to them.

    Returns:
        dict mapping each language name to the sum of that language's
        frequencies for the sample trigrams it contains (0.0 if none match).
    """
    stats = {}
    for name, profile in languages.items():
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        stats[name] = sum(
            (profile[gram] for gram in sample if gram in profile), 0.0
        )
    return stats
def which_lang(stats):
    """Return the key of *stats* with the highest score.

    When several keys share the highest score, the first one encountered
    while iterating *stats* wins.  Returns ``None`` for an empty mapping.
    """
    if not stats:
        return None
    return max(stats, key=lambda name: stats[name])
# Build one trigram frequency profile per language from its corpus file.
en_text = read_file("en.txt")
de_text = read_file("de.txt")
if en_text is False or de_text is False:
    # read_file signals a missing file with False; stop with a clear message
    # instead of failing later inside freq().
    raise SystemExit("corpus files en.txt and de.txt must exist in the working directory")
en = freq(en_text)
de = freq(de_text)

# Classify a sample string and print the best-matching language.
sample = freq("anna widiger")
stats = lang({"en": en, "de": de}, sample)
print(which_lang(stats))

# Dump the German profile, least frequent trigram first.
test = [[gram, de[gram]] for gram in de]
# A key function replaces the cmp-style comparator, which list.sort() no
# longer accepts in Python 3.
test.sort(key=lambda pair: pair[1])
for t in test:
    print(t[0], t[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment