widiger-anna/language_detection.py

## language_detection.py
from __future__ import unicode_literals, print_function
import os
'''
Example of analyzing a string using a sliding window (trigrams, n=3)
counts all occurrences of trigrams, stores them in a dictionary
uses stats of trigram frequencies to identify a language
'''
def read_file(filename):
  '''
  Return the raw contents of filename, or False if it is not a regular file.

  The file is opened in binary mode, so the result is the undecoded
  content exactly as stored on disk.
  '''
  if not os.path.isfile(filename):
    return False
  # "with" closes the handle even when read() raises
  with open(filename, 'rb') as fh:
    return fh.read()

def trigrams(text):
  '''
  Count the lower-cased trigrams of text with a sliding window of width 3.

  A window starts at every character that is not a space. Once the window
  runs into a space or past the end of the text, the remaining positions
  are padded with spaces, so "lo" at the end of a word is counted as "lo ".

  text may be a text string or a byte string; bytes are decoded as UTF-8
  (undecodable bytes are replaced) so that windows slide over characters
  rather than over single bytes. A falsy value -- "", None, or the False
  that read_file returns for a missing file -- yields an empty dict.

  Returns a dict that maps each trigram to its number of occurrences.
  '''
  if not text:
    return {}
  if isinstance(text, bytes):
    text = text.decode("utf-8", "replace")

  counts = {}
  length = len(text)
  for i, first in enumerate(text):
    if first == " ":
      # a trigram never starts on a space
      continue
    second = text[i + 1] if i + 1 < length else " "
    # after a space (or at the end of the text) only padding follows
    if second != " " and i + 2 < length:
      third = text[i + 2]
    else:
      third = " "
    gram = (first + second + third).lower()
    counts[gram] = counts.get(gram, 0) + 1
  return counts

def freq(text):
  '''
  Return the relative frequency of every trigram found in text.

  Each count from trigrams(text) is divided by the total number of trigram
  occurrences, so the values of the returned dict add up to 1.0 and
  profiles built from corpora of different sizes stay comparable.
  A text without any trigram gives an empty dict.
  '''
  counts = trigrams(text)
  # normalise by total occurrences, not by the number of distinct trigrams
  total = float(sum(counts.values()))
  return {gram: n / total for gram, n in counts.items()}

def lang(languages,sample):
  '''
  Score every language profile against a sample.

  languages maps a language name to a dict of trigram -> frequency;
  sample is an iterable of trigrams (only the keys of a dict are used,
  its values are ignored). A language's score is the sum of its profile
  frequencies over the sample trigrams that occur in that profile.

  Returns a dict that maps each language name to its score as a float.
  '''
  stats = {}
  for name, profile in languages.items():
    # "in" replaces dict.has_key, which no longer exists in Python 3
    stats[name] = sum((profile[gram] for gram in sample if gram in profile), 0.0)
  return stats


def which_lang(stats):
  '''
  Pick the language with the largest score.

  stats maps a language name to its score. The key with the highest
  value is returned; on a tie the key that is iterated first wins.
  An empty stats gives None.
  '''
  if not stats:
    return None
  return max(stats, key=stats.get)

# Build one trigram-frequency profile per language from its corpus file.
en = freq(read_file("en.txt"))
de = freq(read_file("de.txt"))

sample = freq("anna widiger")

# Score the sample against both profiles and report the best match.
stats = lang({"en":en,"de":de},sample)

print(which_lang(stats))

# Dump the German profile, least frequent trigram first. A key function
# replaces the old cmp-style comparator, which list.sort no longer accepts
# in Python 3.
test = sorted(([k, de[k]] for k in de), key=lambda pair: pair[1])

for t in test:
  print(t[0],t[1])
	from __future__ import unicode_literals, print_function
	import os
	'''
	Example of analyzing a string using a sliding window (trigrams, n=3)
	counts all occurrences of trigrams, stores them in a dictionary
	uses stats of trigram frequencies to identify a language
	'''
	def read_file(filename):
	  '''
	  Return the raw contents of filename, or False if it is not a regular file.

	  The file is opened in binary mode, so the result is the undecoded
	  content exactly as stored on disk.
	  '''
	  if not os.path.isfile(filename):
	    return False
	  # "with" closes the handle even when read() raises
	  with open(filename, 'rb') as fh:
	    return fh.read()

	def trigrams(text):
	  '''
	  Count the lower-cased trigrams of text with a sliding window of width 3.

	  A window starts at every character that is not a space. Once the window
	  runs into a space or past the end of the text, the remaining positions
	  are padded with spaces, so "lo" at the end of a word is counted as "lo ".

	  text may be a text string or a byte string; bytes are decoded as UTF-8
	  (undecodable bytes are replaced) so that windows slide over characters
	  rather than over single bytes. A falsy value -- "", None, or the False
	  that read_file returns for a missing file -- yields an empty dict.

	  Returns a dict that maps each trigram to its number of occurrences.
	  '''
	  if not text:
	    return {}
	  if isinstance(text, bytes):
	    text = text.decode("utf-8", "replace")

	  counts = {}
	  length = len(text)
	  for i, first in enumerate(text):
	    if first == " ":
	      # a trigram never starts on a space
	      continue
	    second = text[i + 1] if i + 1 < length else " "
	    # after a space (or at the end of the text) only padding follows
	    if second != " " and i + 2 < length:
	      third = text[i + 2]
	    else:
	      third = " "
	    gram = (first + second + third).lower()
	    counts[gram] = counts.get(gram, 0) + 1
	  return counts

	def freq(text):
	  '''
	  Return the relative frequency of every trigram found in text.

	  Each count from trigrams(text) is divided by the total number of trigram
	  occurrences, so the values of the returned dict add up to 1.0 and
	  profiles built from corpora of different sizes stay comparable.
	  A text without any trigram gives an empty dict.
	  '''
	  counts = trigrams(text)
	  # normalise by total occurrences, not by the number of distinct trigrams
	  total = float(sum(counts.values()))
	  return {gram: n / total for gram, n in counts.items()}

	def lang(languages,sample):
	  '''
	  Score every language profile against a sample.

	  languages maps a language name to a dict of trigram -> frequency;
	  sample is an iterable of trigrams (only the keys of a dict are used,
	  its values are ignored). A language's score is the sum of its profile
	  frequencies over the sample trigrams that occur in that profile.

	  Returns a dict that maps each language name to its score as a float.
	  '''
	  stats = {}
	  for name, profile in languages.items():
	    # "in" replaces dict.has_key, which no longer exists in Python 3
	    stats[name] = sum((profile[gram] for gram in sample if gram in profile), 0.0)
	  return stats


	def which_lang(stats):
	  '''
	  Pick the language with the largest score.

	  stats maps a language name to its score. The key with the highest
	  value is returned; on a tie the key that is iterated first wins.
	  An empty stats gives None.
	  '''
	  if not stats:
	    return None
	  return max(stats, key=stats.get)

	# Build one trigram-frequency profile per language from its corpus file.
	en = freq(read_file("en.txt"))
	de = freq(read_file("de.txt"))

	sample = freq("anna widiger")

	# Score the sample against both profiles and report the best match.
	stats = lang({"en":en,"de":de},sample)

	print(which_lang(stats))

	# Dump the German profile, least frequent trigram first. A key function
	# replaces the old cmp-style comparator, which list.sort no longer accepts
	# in Python 3.
	test = sorted(([k, de[k]] for k in de), key=lambda pair: pair[1])

	for t in test:
	  print(t[0],t[1])