Skip to content

Instantly share code, notes, and snippets.

@srikanthlogic
Created February 20, 2012 10:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srikanthlogic/1868666 to your computer and use it in GitHub Desktop.
Save srikanthlogic/1868666 to your computer and use it in GitHub Desktop.
Files for Tamil-English Reverse Transliterator https://github.com/santhoshtr/silpa
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Any Indian Language to any other Indian language transliterator
# Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in
import string
import os
# Maps CMU pronouncing-dictionary phoneme symbols to Malayalam text.
# Vowel phonemes map to independent vowel letters.  Consonant phonemes end in
# a virama (്) -- for "ER", "L", "N" and "R" followed by an invisible joiner
# character -- so that CMUDict._fix_vowel_signs_ml can merge a following
# vowel letter into the matching dependent vowel sign.
# The vowel entries were previously stored as mis-decoded text ("à´…" instead
# of "അ", etc.) and therefore never produced Malayalam output.
CMU_MALAYALAM_MAP = {
    "AA": "ഓ",
    "AH": "അ",
    "AE": "ഏ",
    "AO": "ഓ",
    "AW": "ഔ",
    "AY": "ഐ",
    "B": "ബ്",
    "CH": "ച്ച്",
    "D": "ഡ്",
    "DH": "ദ്",
    "EA": "ഈ",
    "EH": "എ",
    "ER": "എര്‍",
    "EY": "എയ്",
    "F": "ഫ്",
    "G": "ഗ്",
    "HH": "ഹ്",
    "IH": "ഇ",
    "IY": "ഈ",
    "J": "ജ്",
    "JH": "ജ്",
    "K": "ക്",
    "L": "ല്‍",
    "M": "മ്",
    "N": "ന്‍",
    "NG": "ങ്",
    "OW": "ഒ",
    "P": "പ്",
    "R": "ര്‍",
    "S": "സ്",
    "SH": "ഷ്",
    "T": "റ്റ്",
    "TH": "ത്",
    "Y": "യ്",
    "UW": "ഉ",
    "W": "വ്",
    "V": "വ്",
    "Z": "സ്",
}
# Maps CMU pronouncing-dictionary phoneme symbols to Tamil text.
# Most consonant entries end in a pulli (்); unlike the Malayalam and Kannada
# tables, no vowel-sign merging pass is applied to the Tamil output by
# CMUDict.pronunciation.
# Fixes: the "AH", "AO", "AW" and "OW" values were stored as mis-decoded text,
# and "EA" held the Malayalam letter ഈ instead of the Tamil letter ஈ.
# NOTE(review): "B" maps to "பி" (with the vowel sign i) and "W"/"V" have no
# pulli, unlike the other consonants -- left unchanged; confirm intent.
CMU_TAMIL_MAP = {
    "AA": "ஆ",
    "AH": "அ",
    "AE": "எ",
    "AO": "ஒ",
    "AW": "ஔ",
    "AY": "ஐ",
    "B": "பி",
    "CH": "ச்",
    "D": "ட்",
    "DH": "த்",
    "EA": "ஈ",
    "EH": "ஏ",
    "ER": "அர்",
    "EY": "ஏ",
    "F": "ஃப்",
    "G": "க்",
    "HH": "ஹ்",
    "IH": "இ",
    "IY": "இ",
    "J": "ஜ்",
    "JH": "ஜ்",
    "K": "க்",
    "L": "ல்",
    "M": "ம்",
    "N": "ன்",
    "NG": "ங்",
    "OW": "ஔ",
    "P": "ப்",
    "R": "ர்",
    "S": "ச்",
    "SH": "ஷ்",
    "T": "ட்",
    "TH": "த்",
    "Y": "ய்",
    "UW": "உ",
    "W": "வ",
    "V": "வ",
    "Z": "ஸ்",
}
# Maps CMU pronouncing-dictionary phoneme symbols to Kannada text.
# Vowel phonemes map to independent vowel letters; consonant phonemes end in
# a virama (್) so that CMUDict._fix_vowel_signs_kn can merge a following
# vowel letter into the matching dependent vowel sign.
CMU_KANNADA_MAP = {
    # --- vowels and diphthongs ---
    "AA": "ಆ",
    "AH": "ಅ",
    "AE": "ಏ",
    "AO": "ಓ",
    "AW": "ಔ",
    "AY": "ಐ",
    "EA": "ಈ",
    "EH": "ಎ",
    "ER": "ಅರ್",
    "EY": "ಎಯ್",
    "IH": "ಇ",
    "IY": "ಈ",
    "OW": "ಒ",
    "UW": "ಊ",
    "UH": "ಉ",
    # --- consonants ---
    "B": "ಬ್",
    "CH": "ಚ್",
    "D": "ಡ್",
    "DH": "ದ್",
    "F": "ಫ್",
    "G": "ಗ್",
    "HH": "ಹ್",
    "J": "ಜ್",
    "JH": "ಜ್",
    "K": "ಕ್",
    "L": "ಲ್",
    "M": "ಮ್",
    "N": "ನ್",
    "NG": "ಂಗ್",
    "P": "ಪ್",
    "R": "ರ್",
    "S": "ಸ್",
    "SH": "ಷ್",
    "T": "ಟ್",
    "TH": "ತ್",
    "Y": "ಯ್",
    "W": "ವ್",
    "V": "ವ್",
    "Z": "ಸ್",
    "ZH": "ಷ್",
}
class CMUDict():
    """Render English words in an Indian script via the CMU pronouncing
    dictionary.

    A word is looked up in the dictionary file to obtain its phoneme symbols;
    each symbol is then mapped through a per-language table
    (CMU_MALAYALAM_MAP, CMU_KANNADA_MAP or CMU_TAMIL_MAP) and, for Malayalam
    and Kannada, "virama + vowel letter" sequences are merged into dependent
    vowel signs.
    """

    # Characters treated as punctuation around a word in pronunciation().
    _PUNCTUATION = '!,.?:'

    # Ordered (pattern, replacement) pairs for Malayalam.  They are applied
    # one after another, so the order matters: "്റ" is first rewritten to
    # "്ര" and the later "റ്ര" rule restores the "റ്റ" cluster.
    # Every vowel rule comes in two forms: plain virama, and virama followed
    # by the invisible joiner used in the "L", "N", "R" and "ER" map values.
    _ML_VOWEL_SIGN_FIXES = (
        ("്അ", ""),
        ("്‍അ", ""),
        ("്ആ", "ാ"),
        ("്‍ആ", "ാ"),
        ("്ഇ", "ി"),
        ("്‍ഇ", "ി"),
        ("്ഈ", "ീ"),
        ("്‍ഈ", "ീ"),
        ("്ഉ", "ു"),
        ("്‍ഉ", "ു"),
        ("്ഊ", "ൂ"),
        ("്‍ഊ", "ൂ"),
        ("്റ", "്ര"),
        ("്എ", "െ"),
        # Previously replaced by "", which dropped the vowel entirely while
        # every other joiner variant yields the same sign as its plain form.
        ("്‍എ", "െ"),
        ("്ഏ", "േ"),
        ("്‍ഏ", "േ"),
        ("്ഐ", "ൈ"),
        ("്‍ഐ", "ൈ"),
        ("്ഒ", "ൊ"),
        ("്‍ഒ", "ൊ"),
        ("്ഓ", "ോ"),
        ("്‍ഓ", "ോ"),
        ("്ഔ", "ൌ"),
        ("്‍ഔ", "ൌ"),
        ("ര്ര", "റ്റ"),
        ("റ്ര", "റ്റ"),
        ("ന്‍റ്റ", "ന്റ"),
    )

    # Ordered (pattern, replacement) pairs for Kannada: virama followed by an
    # independent vowel becomes the dependent vowel sign (or nothing for ಅ).
    _KN_VOWEL_SIGN_FIXES = (
        ("್ಅ", ""),
        ("್ಆ", "ಾ"),
        ("್ಇ", "ಿ"),
        ("್ಈ", "ೀ"),
        ("್ಉ", "ು"),
        ("್ಊ", "ೂ"),
        ("್ಋ", "ೃ"),
        ("್ಎ", "ೆ"),
        ("್ಏ", "ೇ"),
        ("್ಐ", "ೈ"),
        ("್ಒ", "ೊ"),
        ("್ಓ", "ೋ"),
        ("್ಔ", "ೌ"),
    )

    def __init__(self):
        # The dictionary file is expected next to this module.
        self.dictionaryfile = os.path.join(os.path.dirname(__file__),
                                           'cmudict.0.7a_SPHINX_40')
        # Populated lazily by find() on the first lookup.
        self.cmudictionary = None

    def load(self):
        """Read the dictionary file into ``self.cmudictionary``.

        Every non-blank line is split on whitespace: the first field is the
        word (the lookup key) and the remaining fields are its phoneme
        symbols.  Blank lines are skipped (they used to raise IndexError) and
        the file is closed even if reading fails.
        """
        entries = {}
        with open(self.dictionaryfile, "r") as fdict:
            for line in fdict:
                fields = line.split()
                if not fields:
                    continue
                entries[fields[0]] = fields[1:]
        self.cmudictionary = entries

    def find(self, word):
        """Return the list of phoneme symbols for ``word``.

        The lookup is done on the upper-cased word.  The dictionary is loaded
        on first use.

        Raises:
            KeyError: if the word is not in the dictionary.
        """
        if self.cmudictionary is None:
            self.load()
        return self.cmudictionary[word.upper()]

    def pronunciation(self, word, language):
        """Transliterate an English ``word`` into ``language``.

        Arguments:
        - `word`: the English word; punctuation from ``!,.?:`` around it is
          set aside and re-attached to the result.
        - `language`: "ml_IN", "kn_IN" or "ta_IN".

        Returns the transliterated text.  The word is returned unchanged when
        it is not in the dictionary or when ``language`` is not one of the
        supported codes.  Phoneme symbols missing from the language table are
        copied to the output as they are.
        """
        # Separate leading and trailing punctuation.  The previous code
        # stripped both ends but sliced as if only the tail had been removed,
        # which corrupted words with leading punctuation.
        without_leading = word.lstrip(self._PUNCTUATION)
        leading = word[:len(word) - len(without_leading)]
        stripped_word = without_leading.rstrip(self._PUNCTUATION)
        trailing = without_leading[len(stripped_word):]
        try:
            cmu_pronunciation = self.find(stripped_word)
        except KeyError:
            return word

        # The tables live at module level; they are looked up here (not at
        # class-definition time) so the class does not depend on definition
        # order.
        if language == "ml_IN":
            table, fix_vowel_signs = CMU_MALAYALAM_MAP, self._fix_vowel_signs_ml
        elif language == "kn_IN":
            table, fix_vowel_signs = CMU_KANNADA_MAP, self._fix_vowel_signs_kn
        elif language == "ta_IN":
            # No vowel-sign merging pass exists for Tamil.
            table, fix_vowel_signs = CMU_TAMIL_MAP, None
        else:
            return word

        pronunciation_str = "".join(table.get(symbol, symbol)
                                    for symbol in cmu_pronunciation)
        if fix_vowel_signs is not None:
            pronunciation_str = fix_vowel_signs(pronunciation_str)
        # On Python 2 the tables hold UTF-8 byte strings; decode only then.
        if isinstance(pronunciation_str, bytes):
            pronunciation_str = pronunciation_str.decode("utf-8")
        return leading + pronunciation_str + trailing

    def _fix_vowel_signs_ml(self, text):
        """Merge "virama + vowel letter" sequences in Malayalam ``text`` into
        dependent vowel signs and normalise a few consonant clusters."""
        for pattern, replacement in self._ML_VOWEL_SIGN_FIXES:
            text = text.replace(pattern, replacement)
        return text

    def _fix_vowel_signs_kn(self, text):
        """Merge "virama + vowel letter" sequences in Kannada ``text`` into
        dependent vowel signs."""
        for pattern, replacement in self._KN_VOWEL_SIGN_FIXES:
            text = text.replace(pattern, replacement)
        return text
#!/usr/bin/python
# -*- coding: utf-8 -*-
#indic_en.py
#
#Copyright 2010 Vasudev Kamath <kamathvasudev@gmail.com>
#
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#MA 02110-1301, USA.
#
'''
This file contains the language-specific dictionaries, vowel lists and
vowel-sign lists, together with the functions that return them for a given
language. The aim is to make indic_en transliteration more generic.
'''
# Kannada -> Latin transliteration table, keyed by Kannada character (plus
# three two-consonant conjuncts).
kannada_english_dict = {
    # independent vowels, anusvara and visarga
    u'ಅ': 'a', u'ಆ': 'aa', u'ಇ': 'i', u'ಈ': 'i', u'ಉ': 'u',
    u'ಊ': 'u', u'ಋ': 'rri', u'ಎ': 'e', u'ಏ': 'e', u'ಐ': 'ai',
    u'ಒ': 'o', u'ಓ': 'o', u'ಔ': 'au', u'ಂ': 'm', u'ಃ': 'h',
    # consonants
    u'ಕ': 'k', u'ಖ': 'kh', u'ಗ': 'g', u'ಘ': 'gh', u'ಙ': 'ng',
    u'ಚ': 'ch', u'ಛ': 'chh', u'ಜ': 'j', u'ಝ': 'jhh', u'ಞ': 'nj',
    u'ತ': 'th', u'ಥ': 'thh', u'ದ': 'd', u'ಧ': 'dh', u'ನ': 'n',
    u'ಟ': 'T', u'ಠ': 'Th', u'ಡ': 'D', u'ಢ': 'Dh', u'ಣ': 'N',
    u'ಪ': 'p', u'ಫ': 'ph', u'ಬ': 'b', u'ಭ': 'bh', u'ಮ': 'm',
    u'ಯ': 'y', u'ರ': 'r', u'ಲ': 'l', u'ವ': 'v', u'ಶ': 'sh',
    u'ಷ': 'shh', u'ಸ': 's', u'ಹ': 'h', u'ಳ': 'L',
    # virama and dependent vowel signs
    u'್': '', u'ಾ': 'aa', u'ಿ': 'i', u'ೀ': 'i',
    u'ು': 'u', u'ೂ': 'u', u'ೃ': 'rri', u'ೆ': 'e', u'ೇ': 'e',
    u'ೈ': 'ai', u'ೊ': 'o', u'ೋ': 'o', u'ೌ': 'au',
    # conjuncts
    u'ಕ್ಷ': 'ksh', u'ತ್ರ': 'tr', u'ಜ್ಞ': 'jn',
    # digits
    u'೧': '1', u'೨': '2', u'೩': '3', u'೪': '4', u'೫': '5',
    u'೬': '6', u'೭': '7', u'೮': '8', u'೯': '9', u'೦': '0',
}

# Independent vowel letters of Kannada.
kn_vowels = [
    u'ಅ', u'ಆ', u'ಇ', u'ಈ', u'ಉ', u'ಊ', u'ಋ', u'ಎ', u'ಏ', u'ಐ',
    u'ಒ', u'ಓ', u'ಔ',
]

# Virama, dependent vowel signs, anusvara, visarga -- and the space character.
kn_vowel_signs = [
    u'್', u'ಾ', u'ಿ', u'ೀ', u'ು', u'ೂ', u'ೃ', u'ೆ', u'ೇ',
    u'ೈ', u'ೊ', u'ೋ', u'ೌ', u'ಂ', u'ಃ', u' ',
]
# Tamil -> Latin transliteration table.  Vowels are keyed by the independent
# vowel letter; consonants are keyed by the two-character sequence
# "consonant + pulli".
# Fix: the key for 'ஞ்' was missing its ``u`` prefix, which made it a byte
# string on Python 2 while every other key is a unicode string.
tamil_english_dict = {
    # independent vowels
    u'அ': 'a', u'ஆ': 'aa', u'இ': 'i', u'ஈ': 'ii', u'உ': 'u', u'ஊ': 'uu',
    u'எ': 'e', u'ஏ': 'ee', u'ஐ': 'ai', u'ஒ': 'o', u'ஓ': 'oo', u'ஔ': 'au',
    # consonants (with pulli)
    u'க்': 'k', u'ங்': 'ng', u'ச்': 's', u'ஞ்': 'nj', u'ட்': 'd',
    u'ண்': 'N', u'த்': 'th', u'ந்': 'w',
    u'ப்': 'p', u'ம்': 'm', u'ய்': 'y', u'ர்': 'r', u'ல்': 'l',
    u'வ்': 'v', u'ழ்': 'zh', u'ள்': 'L', u'ற்': 'R', u'ன்': 'n',
}

# Independent vowel letters of Tamil, plus the aytham (ஃ).
tamil_vowels = [
    u'அ', u'ஆ', u'இ', u'ஈ', u'உ', u'ஊ', u'எ',
    u'ஏ', u'ஐ', u'ஒ', u'ஓ', u'ஔ', u'ஃ',
]

# Dependent vowel signs of Tamil, followed by the pulli.
tamil_vowel_signs = [
    u'ா', u'ி', u'ீ', u'ு', u'ூ', u'ெ', u'ே', u'ை', u'ொ', u'ோ', u'ௌ', u'்',
]
# Malayalam -> Latin transliteration table, keyed by Malayalam character.
# Fix: many keys were stored as mis-decoded text (e.g. u'à´…' instead of
# u'അ'), so those letters could never be matched against real input.
malayalam_english_dict = {
    # independent vowels
    u'അ': 'a', u'ആ': 'aa', u'ഇ': 'i', u'ഈ': 'ee', u'ഉ': 'u', u'ഊ': 'oo',
    u'ഋ': 'ri',
    u'എ': 'e', u'ഏ': 'e', u'ഐ': 'ai', u'ഒ': 'o', u'ഓ': 'o', u'ഔ': 'au',
    # consonants
    u'ക': 'k', u'ഖ': 'kh', u'ഗ': 'g', u'ഘ': 'gh', u'ങ്ങ': 'ng', u'ങ': 'ng',
    u'ച': 'ch', u'ഛ': 'chh', u'ജ': 'j', u'ഝ': 'jhh', u'ഞ': 'nj',
    u'ട': 't', u'ഠ': 'th', u'ഡ': 'd', u'ഢ': 'dh', u'ണ': 'n',
    u'ത': 'th', u'ഥ': 'th', u'ദ': 'd', u'ധ': 'dh', u'ന': 'n',
    u'പ': 'p', u'ഫ': 'ph', u'ബ': 'b', u'ഭ': 'bh', u'മ': 'm',
    u'യ': 'y', u'ര': 'r', u'ല': 'l', u'വ': 'v', u'റ': 'r',
    u'ശ': 's', u'ഷ': 'sh', u'സ': 's', u'ഹ': 'h', u'ള': 'l', u'ഴ': 'zh',
    # virama, anusvara and dependent vowel signs
    u'്': '', u'ം': 'm', u'ാ': 'aa', u'ി': 'i', u'ീ': 'ee', u'ു': 'u',
    u'ൂ': 'oo', u'ൃ': 'ri', u'െ': 'e', u'േ': 'e',
    u'ൈ': 'ai', u'ൊ': 'o', u'ോ': 'oo', u'ൗ': 'au', u'ൌ': 'ou',
}

# Independent vowel letters of Malayalam (the last three were mis-decoded).
ml_vowels = [
    u'അ', u'ആ', u'ഇ', u'ഈ', u'ഉ', u'ഊ', u'ഋ', u'എ', u'ഏ', u'ഐ',
    u'ഒ', u'ഓ', u'ഔ',
]

# Virama, anusvara and dependent vowel signs.  The final entry is the
# invisible joiner that follows the virama in chillu sequences; it is written
# as an escape so it is visible in the source.
# NOTE(review): taken to be ZERO WIDTH JOINER (U+200D) -- confirm against the
# original file, where the character is an invisible literal.
ml_vowel_signs = [
    u'്', u'ം', u'ാ', u'ി', u'ീ', u'ു', u'ൂ', u'ൃ', u'െ', u'േ',
    u'ൈ', u'ൊ', u'ോ', u'ൗ', u'ൌ', u'\u200d',
]
# NOTE: Declare all language-specific tables above this line, and register
# each new language's dictionary, vowels and vowel signs in the mappings
# below.
# Per-language lookup tables, keyed by language code.  Only Kannada (kn_IN)
# and Malayalam (ml_IN) are registered.

# language code -> character-to-Latin dictionary
language_dictionary = {
    "kn_IN": kannada_english_dict,
    "ml_IN": malayalam_english_dict,
}

# language code -> list of independent vowels
language_vowels = {
    "kn_IN": kn_vowels,
    "ml_IN": ml_vowels,
}

# language code -> list of vowel signs
language_vowel_signs = {
    "kn_IN": kn_vowel_signs,
    "ml_IN": ml_vowel_signs,
}

# language code -> virama sign
language_virama = {
    "kn_IN": u"್",
    "ml_IN": u"്",
}

# language code -> anuswara sign
language_anuswara = {
    "kn_IN": u"ಂ",
    "ml_IN": u'ം',
}
def get_dictionary_for(lang="ml_IN"):
    """
    Return the character-to-Latin dictionary registered for `lang` in
    language_dictionary.

    If no dictionary is registered for the language, the ml_IN dictionary
    is returned.  (The fallback used to be the literal string "ml_IN",
    which is not a dictionary and broke every later lookup.)

    Arguments:
    - `lang`: Language for which dictionary is required
    """
    return language_dictionary.get(lang, language_dictionary["ml_IN"])
def get_vowels_for(lang="ml_IN"):
    """
    Return the vowel list registered for `lang` in language_vowels.

    If no vowel list is registered for the language, the ml_IN list is
    returned.  (The fallback used to be the literal string "ml_IN".)

    Arguments:
    - `lang`: Language for which vowel list should be returned
    """
    return language_vowels.get(lang, language_vowels["ml_IN"])
def get_vowel_signs_for(lang="ml_IN"):
    """
    Return the vowel-sign list registered for `lang` in
    language_vowel_signs.

    If no vowel-sign list is registered for the language, the ml_IN list
    is returned.  (The fallback used to be the literal string "ml_IN".)

    Arguments:
    - `lang`: Language for which vowel signs list should be returned
    """
    return language_vowel_signs.get(lang, language_vowel_signs["ml_IN"])
def get_virama_for(lang="ml_IN"):
    """
    Return the virama symbol registered for `lang` in language_virama.

    If no virama is registered for the language, the ml_IN virama is
    returned.  (The fallback used to be the literal string "ml_IN", which
    can never equal a single character.)

    Arguments:
    - `lang`: Language for which virama symbol should be returned
    """
    return language_virama.get(lang, language_virama["ml_IN"])
def get_anuswara_for(lang="ml_IN"):
    """
    Return the anuswara symbol registered for `lang` in language_anuswara.

    If no anuswara is registered for the language, the ml_IN anuswara is
    returned.  (The fallback used to be the literal string "ml_IN", which
    can never equal a single character.)

    Arguments:
    - `lang`: Language for which anuswara symbol is needed
    """
    return language_anuswara.get(lang, language_anuswara["ml_IN"])
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Any Indian Language to any other Indian language transliterator
# Copyright 2009-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions
# email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in
from common import *
from utils import *
import string
import os
from cmudict import CMUDict
from indic_en import *
class Transliterator(SilpaModule):
    """Silpa module that transliterates text between English, Indian
    language scripts, ISO 15919 and IPA.

    Relies on names imported at the top of the file: ``lang_bases``,
    ``charmap``, ``normalizer`` and ``detect_lang`` as well as the indic_en
    getters and CMUDict.
    NOTE(review): ``lang_bases`` is assumed to map a language code to the
    code point its script is offset from -- confirm against its definition.
    """

    # Offset of the virama from a script's base (per the original comments,
    # the same for all the Indian scripts handled here).
    _VIRAMA_OFFSET = 76

    # Code points outside this range are copied through unchanged by
    # transliterate_indic_indic.  The range runs from the start of the
    # Devanagari block to the end of the Malayalam block.
    _INDIC_FIRST = 0x0900
    _INDIC_LAST = 0x0D7F

    # Languages whose trailing inherent vowel / virama is dropped.  The two
    # sets differ in the original code (Punjabi only appears in the
    # indic-to-indic path) and are kept distinct.
    _SCHWA_DELETING = ("hi_IN", "gu_IN", "bn_IN")
    _SCHWA_DELETING_INDIC = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")

    # Replacements applied when the target is Tamil: code points that the
    # offset arithmetic produces but for which a Tamil letter is substituted.
    _TAMIL_FOLDS = (
        (u'\u0B96', u"க"),
        (u'\u0B97', u"க"),
        (u'\u0B98', u"க"),
        (u'\u0B9B', u"ச"),
        (u'\u0B9D', u"ச"),
        (u'\u0BA0', u"ட"),
        (u'\u0BA1', u"ட"),
        (u'\u0BA2', u"ட"),
        (u'\u0BA5', u"த"),
        (u'\u0BA6', u"த"),
        (u'\u0BA7', u"த"),
        (u'\u0BAB', u"ப"),
        (u'\u0BAC', u"ப"),
        (u'\u0BAD', u"ப"),
        (u'\u0BC3', u"ிரு"),
        (u'ஂ', u'ம்'),
    )

    def __init__(self):
        self.template = os.path.join(os.path.dirname(__file__),
                                     'transliterate.html')
        self.cmu = CMUDict()
        self.response = SilpaResponse(self.template)

    @staticmethod
    def _as_text(value):
        """Return ``value`` decoded from UTF-8 if it is a byte string,
        otherwise unchanged (keeps the methods usable on Python 2 and 3)."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    @staticmethod
    def _code_to_char(code):
        """Return the one-character string for code point ``code``."""
        try:
            return unichr(code)  # Python 2
        except NameError:
            return chr(code)     # Python 3

    def transliterate_en_ml(self, word):
        """
        Transliterate English to Malayalam with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        Transliterate English to Kannada with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_ta(self, word):
        """
        Transliterate English to Tamil with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ta_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        Transliterate English to any Indian Language.

        Kannada and Tamil use their own CMU tables.  Every other language
        is produced by transliterating to Malayalam first and then
        converting Malayalam to the target script.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        # Return the direct results immediately.  Previously the Kannada
        # result fell through to the Malayalam branch and was overwritten.
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "ta_IN":
            return self.transliterate_en_ta(word)
        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str
        # Chain it through indic-indic transliteration.
        # First remove the Malayalam-specific zero-width joiners; both
        # joiner characters are removed (transliterate_indic_indic strips
        # both again for Malayalam sources anyway).
        tx_str = tx_str.replace(u'\u200d', u'').replace(u'\u200c', u'')
        # Drop a final Malayalam virama for schwa-deleting targets.
        if tx_str[-1:] == u'\u0d4d' and target_lang in self._SCHWA_DELETING:
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        Transliterate Indian Language to English.

        Kannada and Malayalam are handled directly; any other language is
        first converted to Malayalam.
        """
        if src_lang in ("en_IN", "en_US"):
            return word
        # TODO: indic_en only has tables for kn_IN and ml_IN, hence this
        # check.  Once all Indic languages are added to indic_en, replace
        # this block with a single call to transliterate_indic_en.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919 using
        charmap["ISO15919"], indexed by each character's offset from
        lang_bases[src_language].
        """
        tx_str = ""
        for char in word:
            offset = ord(char) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its
            # base; offsets 61..76 replace the previously emitted
            # inherent 'a'.
            if 61 <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-1]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
        # Delete the inherent 'a' at the end of a multi-character word for
        # the schwa-deleting languages.
        if (len(word) > 1 and tx_str[-1:] == 'a'
                and src_language in self._SCHWA_DELETING):
            tx_str = tx_str[:-1]
        return self._as_text(tx_str)

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.
        """
        # The inherent vowel.  It used to be written as the mis-decoded
        # text 'É™', and was compared against a one-element slice, which
        # cannot match a multi-byte string on Python 2; endswith() works
        # for both byte and text strings.
        schwa = 'ə'
        tx_str = ""
        for char in word:
            if ord(char) < 255:  # ASCII characters + English
                tx_str += char
                continue
            offset = ord(char) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if 61 <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-len(schwa)]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["IPA"][offset]
        # Delete the inherent vowel at the end of a multi-character word
        # for the schwa-deleting languages.
        if (len(word) > 1 and tx_str.endswith(schwa)
                and src_language in self._SCHWA_DELETING):
            tx_str = tx_str[:-len(schwa)]
        return self._as_text(tx_str)

    def _malayalam_fixes(self, text):
        """Apply Malayalam-specific clean-ups to transliterated text.

        Best effort: if a replacement fails with a Unicode error the
        remaining replacements are skipped and the text is returned as far
        as it was processed.
        """
        fixes = (
            # "m + virama" before a space or punctuation becomes anusvara.
            (u"മ് ", u"ം "),
            (u"മ്,", u"ം,"),
            (u"മ്.", u"ം."),
            (u"മ്)", u"ം)"),
            # ഩ (U+0D29) -> ന (U+0D28); both were mis-decoded literals.
            (u"\u0d29", u"\u0d28"),
            # danda by fullstop
            (u"൤", u"."),
        )
        try:
            for pattern, replacement in fixes:
                text = text.replace(pattern, replacement)
        except UnicodeError:
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word
        to another Indian language word.

        Each character is shifted by the difference between the two
        languages' bases (see getOffset).  Punctuation and characters
        outside the Indic range are copied unchanged.
        """
        tx_str = ""
        word = normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # NOTE(review): the original comment reads "replace all
            # samvruthokaram by u vowels", but the sequence is removed.
            word = word.replace(u"ു്", u"")
        shift = self.getOffset(src_lang, target_lang)
        last_index = len(word)
        for index, char in enumerate(word, 1):
            code = ord(char)
            # The original test (<= 2304 and >= 3071) could never be true,
            # so non-Indic characters were shifted into unrelated code
            # points instead of being passed through.
            if (char in string.punctuation
                    or not self._INDIC_FIRST <= code <= self._INDIC_LAST):
                tx_str = tx_str + char
                continue
            offset = code + shift
            if offset > 0:
                tx_str = tx_str + self._code_to_char(offset)
                # Schwa deletion: drop a word-final virama.
                # TODO Add more languages having schwa deletion
                # characteristic
                baseoffset = offset - lang_bases[target_lang]
                if (index == last_index
                        and baseoffset == self._VIRAMA_OFFSET
                        and target_lang in self._SCHWA_DELETING_INDIC):
                    tx_str = tx_str[:-len(char)]
        if target_lang == "ml_IN" and src_lang == "ta_IN":
            # ഩ (U+0D29) -> ന (U+0D28)
            tx_str = tx_str.replace(u"\u0d29", u"\u0d28")
        if target_lang == "ta_IN":
            for pattern, replacement in self._TAMIL_FOLDS:
                tx_str = tx_str.replace(pattern, replacement)
        # If target is malayalam, we need to add the virama.  The emptiness
        # check avoids an IndexError on empty input.
        if (target_lang == "ml_IN"
                and src_lang in self._SCHWA_DELETING_INDIC
                and tx_str and tx_str[-1].isalpha()):
            tx_str = tx_str + u"്"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate text in an Indian language to Latin letters.

        Arguments:
        - `self`:
        - `word`: Word to be transliterated (sentence)
        - `src_lang`: Language from which we need to transliterate
        """
        # Get all the language related tables
        dictionary = get_dictionary_for(src_lang)
        vowels = get_vowels_for(src_lang)
        vowel_signs = get_vowel_signs_for(src_lang)
        virama = get_virama_for(src_lang)
        anuswara = get_anuswara_for(src_lang)
        word_length = len(word)
        tx_string = ""
        for index, current in enumerate(word):
            # Copy punctuation as it is.  This avoids getting an extra 'a'
            # at the beginning of the word next to a punctuation symbol.
            if current in string.punctuation:
                tx_string += current
                continue
            # Virama = conjunct marker; it produces no output.
            if current == virama:
                continue
            # English equivalent of the character; characters missing from
            # the dictionary are appended unchanged.
            tx_string += dictionary.get(current, current)
            has_next = index + 1 < word_length
            # Inherent 'a' after a consonant that is followed by another
            # dictionary character which is not a vowel sign.
            if (has_next
                    and word[index + 1] not in vowel_signs
                    and word[index + 1] in dictionary
                    and current not in vowels
                    and current not in vowel_signs):
                tx_string += 'a'
            # Inherent 'a' after a word-final dictionary character that is
            # not a vowel sign.
            if (index + 1 == word_length
                    and current not in vowel_signs
                    and current in dictionary):
                tx_string += 'a'
            # handle am sign
            if (has_next
                    and word[index + 1] == anuswara
                    and current not in vowel_signs):
                tx_string += 'a'
        return tx_string

    @ServiceMethod
    def transliterate(self, text, target_lang_code):
        """Transliterate ``text`` word by word into ``target_lang_code``.

        The text is split into lines and then on single spaces.  The source
        language of each word is detected with detect_lang; words whose
        language cannot be detected are copied through.
        ``target_lang_code`` may be a language code, "ISO15919" or "IPA".
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            words = line.split(" ")
            for word in words:
                if not word.strip():
                    # Whitespace-only token: keep it as it is.
                    tx_str = tx_str + word
                    continue
                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    tx_str = tx_str + " " + word
                    continue  # FIXME
                if target_lang_code == "ISO15919":
                    tx_str = tx_str + \
                        self.transliterate_iso15919(word, src_lang_code) + " "
                elif target_lang_code == "IPA":
                    tx_str = tx_str + \
                        self.transliterate_ipa(word, src_lang_code) + " "
                elif src_lang_code == "en_US":
                    tx_str = tx_str + \
                        self.transliterate_en_xx(word, target_lang_code) + " "
                elif target_lang_code in ("en_US", "en_IN"):
                    tx_str = tx_str + \
                        self.transliterate_xx_en(word, src_lang_code) + " "
                else:
                    tx_str += self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                    # Separate words with a space.  This used to test
                    # len(lines), which glued together the words of
                    # single-line input.
                    if len(words) > 1:
                        tx_str += " "
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """Return lang_bases[target] - lang_bases[src], or 0 when either
        language has no entry in lang_bases."""
        try:
            return lang_bases[target] - lang_bases[src]
        except KeyError:
            return 0

    def get_module_name(self):
        return "Transliterator"

    def get_info(self):
        return "Transliterate the text between any Indian Language"
def getInstance():
    """Create and return a new Transliterator object."""
    instance = Transliterator()
    return instance
@tecoholic
Copy link

Boss putting 4/5 files as a gist is too much OK. this should be a repo only.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment