Skip to content

Instantly share code, notes, and snippets.

@shantanuo
Forked from santhoshtr/syllabify-with-index.py
Created February 10, 2012 07:10
Show Gist options
  • Save shantanuo/1787352 to your computer and use it in GitHub Desktop.
Save shantanuo/1787352 to your computer and use it in GitHub Desktop.
syllabify with word and syllable index
#!/usr/bin/python
# -*- coding: utf-8 -*-
# texts =[u"वाराणसी", u"भौगोलिक", u"उपदर्शन"]
# CREATE TABLE syllabalize (id int(11) DEFAULT NULL, seqn int(11) DEFAULT NULL, akshar varchar(10) DEFAULT NULL) DEFAULT CHARSET=utf8
import codecs
# Devanagari dependent signs that always belong to the preceding character:
# candrabindu (U+0901), anusvara (U+0902), visarga (U+0903), the vowel signs
# (matras) and the virama (U+094D).
signs = [
    u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
    u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948',
    u'\u094a', u'\u094b', u'\u094c', u'\u094d', u'\u0901']

# Punctuation marks that are always emitted as a unit of their own.
limiters = [u'.', u'"', u"'", u'`', u'!', u';', u',', u'?']

# A consonant followed by virama joins with the next character (conjunct).
virama = u'\u094d'


def syllabify(text):
    """Split one word into a list of syllable-like units.

    Rules, applied character by character:
      * a limiter (punctuation) always starts a new unit;
      * a dependent sign is appended to the previous unit;
      * any other character is appended to the previous unit when that
        unit ends with a virama, otherwise it starts a new unit.

    A sign at the very start of the word has no previous unit to attach
    to; it becomes a unit of its own instead of raising IndexError.
    """
    units = []
    for char in text:
        if char in limiters:
            units.append(char)
        elif units and (char in signs or units[-1][-1] == virama):
            units[-1] += char
        else:
            # First character of the word, or a character that does not
            # continue the previous unit.
            units.append(char)
    return units


def _sql_quote(value):
    """Return value as a single-quoted SQL string literal.

    Backslashes are doubled first so the quote escaping added afterwards
    is not itself re-escaped.
    """
    escaped = value.replace(u'\\', u'\\\\').replace(u"'", u"''")
    return u"'" + escaped + u"'"


def _shell_double_quote(text):
    """Wrap text in double quotes for a POSIX shell.

    Backslash, double quote, dollar sign and backtick keep a special
    meaning inside double quotes, so each one is backslash-escaped.
    Backslash comes first so later escapes are not doubled.
    """
    for special in u'\\"$`':
        text = text.replace(special, u'\\' + special)
    return u'"' + text + u'"'


def insert_command(text_index, index, syllable):
    """Build the mysql command line that inserts one syllable row.

    text_index is the 1-based position of the word in the input, index is
    the 1-based position of the syllable within that word. The syllable
    is stored exactly as given: no padding spaces are added inside the
    SQL quotes, and quote characters are escaped for both SQL and shell.
    """
    sql = u"insert into test.syllabalize values (%d, %d, %s)" % (
        text_index, index, _sql_quote(syllable))
    return u'mysql -e' + _shell_double_quote(sql)


def _emit(line):
    """Print one line of text, as UTF-8 bytes when running on Python 2."""
    if str is bytes:
        # Python 2: encode explicitly so output can be piped or redirected.
        line = line.encode('utf-8')
    print(line)


def main(path='testfile.txt'):
    """Read whitespace-separated words from path and print one mysql
    insert command per syllable."""
    with codecs.open(path, encoding='utf-8') as handle:
        texts = handle.read().split()
    for text_index, text in enumerate(texts, 1):
        for index, syllable in enumerate(syllabify(text), 1):
            _emit(insert_command(text_index, index, syllable))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment