Skip to content

Instantly share code, notes, and snippets.

@arcturusannamalai
Created April 18, 2020 20:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arcturusannamalai/9372241294f5daeb968c82fd485710d6 to your computer and use it in GitHub Desktop.
Save arcturusannamalai/9372241294f5daeb968c82fd485710d6 to your computer and use it in GitHub Desktop.
#!/bin/env python3
from codecs import open
from tamil import utf8
import re
with open('kuttistory.txt','r','utf-8') as fp:
data = fp.readlines()
class Stats:
__fields__ = ('total_words','tamil_words')
stats = Stats()
stats.total_words=0.0
stats.tamil_words=0.0
for line in data:
all_words = re.split('\s+',line.strip())
ta_words = list(utf8.get_tamil_words(utf8.get_letters(line)))
print(all_words,len(ta_words))
stats.tamil_words += len( ta_words )
stats.total_words += len(all_words)
#tamil fraction
taf = float(stats.tamil_words)/stats.total_words
print('English = {0}%, Tamil = {1}%'.format(100.0*(1-taf),100.0*(taf)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment