Skip to content

Instantly share code, notes, and snippets.

@7shi
Last active May 19, 2020 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 7shi/adb49f584096d2c68d7839dd5a6f4fe7 to your computer and use it in GitHub Desktop.
Save 7shi/adb49f584096d2c68d7839dd5a6f4fe7 to your computer and use it in GitHub Desktop.
[py] SSML converter for Interlingue/Occidental
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
import getopt, re, sys
# Locale code substituted into the SSML header's xml:lang attribute and
# consulted by getups(); the "-l" command-line option overrides it.
lang = "ro-RO" # ca-ES, pl-PL, ro-RO, sk-SK, sl-SI
# Sample words for the "if testwords:" debug traces in this file; they only
# print when this list is non-empty (swap in the commented expression to try).
testwords = [] #"intercalar forme familie rapidmen duplic café ínpossibil".split()
# Letter -> phoneme symbol(s). A value may carry several symbols joined by
# "|" (e.g. the accented vowels map to a stress marker plus the vowel).
phonemes = {}


def setphonemes(phs):
    """Register letter-to-phoneme mappings in the module-level ``phonemes``.

    ``phs`` is a whitespace-separated list of ``letter,symbols`` entries.
    Later registrations for the same letter overwrite earlier ones.

    Raises:
        ValueError: if an entry does not contain exactly one comma.
    """
    for entry in phs.split():
        letter, symbols = entry.split(",")
        phonemes[letter] = symbols


setphonemes("a,A b,B ć,TS d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,DZ ź,Z")
setphonemes("á,s1|A é,s1|E í,s1|I ó,s1|O ú,s1|U")
# Ordered spelling rewrites as [old, new] pairs. getphoneme applies them one
# after another, so order matters: "ch", "ce" and "ci" must be rewritten
# before the bare "c" rule turns every remaining "c" into "k".
repls = [
    rule.split(",")
    for rule in """
ch,ŝ ce,će ci,ći c,k qu,kw x,ks ge,je gi,ji ss,s
tia,ćia tie,ćie tio,ćio tiu,ćiu
""".split()
]
def getph(ch):
    """Return the phoneme string registered for ``ch``, or "" if there is none."""
    return phonemes.get(ch, "")
def getphoneme(word):
    """Convert a written word into a flat list of phoneme symbols.

    The word is lower-cased, every "s" standing between two plain vowels is
    rewritten to "ź", the ``repls`` spelling rules are applied in order, and
    finally each character is looked up with ``getph``. Table entries that
    hold several symbols joined by "|" contribute one list item per symbol.
    Characters without a table entry (hyphens, digits, unknown letters) are
    skipped.
    """
    # Lookarounds leave the neighbouring vowels unconsumed, so consecutive
    # intervocalic "s" (vowel-s-vowel-s-vowel) are all rewritten.
    word = re.sub(r"(?<=[aeiou])s(?=[aeiou])", "ź", word.lower())
    for old, new in repls:
        word = word.replace(old, new)
    ret = []
    for ch in word:
        symbols = getph(ch)
        # "".split("|") would yield [""]; an empty phoneme is falsy and can be
        # mistaken for end of input by consumers, so unknown characters must
        # contribute nothing at all.
        if symbols:
            ret += symbols.split("|")
    return ret
# Debug trace: show the phoneme list produced for each sample word.
if testwords:
    print("# getphoneme")
    for w in testwords:
        print(w, "->", getphoneme(w))
class Parser:
def __init__(self, src):
self.i = iter(src)
self.cur = None
def peek(self):
if self.cur: return self.cur
try:
self.cur = next(self.i)
except StopIteration:
pass
return self.cur
def read(self):
ret = self.peek()
self.cur = None
return ret
def accept(self):
if self.cur:
self.cur = None
self.peek()
def isconsonant(ph):
    """Return True if ``ph`` is a non-empty symbol other than a plain vowel.

    Anything that is not exactly one of A, E, I, O, U counts, which includes
    marker symbols such as "s1". ``None`` and "" give False.
    """
    # Exact membership (not a substring test against "AEIOU") and a real
    # bool result rather than the falsy input itself.
    return bool(ph) and ph not in {"A", "E", "I", "O", "U"}
def syllablize(phs):
    """Group a flat phoneme list into syllables (a list of phoneme lists).

    The phonemes are scanned from the end of the word. Consonants pile up
    until a vowel is met; the vowel then pulls in, from the phonemes before
    it:

    * a stress marker "s1", or a preceding "I"/"U" that differs from the
      vowel itself;
    * one consonant;
    * a second consonant when the first is "L" or "R" and the second is a
      different symbol, or when the two spell "KW".

    Consonants left over at the start of the word are attached to the first
    syllable. Empty entries in ``phs`` are ignored.
    """
    # The read loop below stops at the first falsy item, so an empty phoneme
    # would silently drop everything in front of it; filter them out first.
    p = Parser(reversed([ph for ph in phs if ph]))
    ret = []
    cur = []
    while (ph := p.read()):
        cur.insert(0, ph)
        if isconsonant(ph):
            continue  # coda consonant: keep collecting until the vowel
        c1 = p.peek()
        if c1 == "s1" or (c1 in ("I", "U") and ph != c1):
            p.accept()
            cur.insert(0, c1)
            c1 = p.peek()
        if isconsonant(c1):
            p.accept()
            cur.insert(0, c1)
            c2 = p.peek()
            if isconsonant(c2):
                # Onset clusters: consonant + L/R (but not LL/RR), or KW.
                if (c1 in ("L", "R") and c2 != c1) or c2 + c1 == "KW":
                    p.accept()
                    cur.insert(0, c2)
        ret.insert(0, cur)
        cur = []
    if cur:
        # Word-initial consonants that were never claimed by a vowel.
        if ret:
            ret[0] = cur + ret[0]
        else:
            ret = [cur]
    return ret
# Debug trace: show the syllable split for each sample word.
if testwords:
    print("# syllablize")
    for w in testwords:
        print(w, "->", syllablize(getphoneme(w)))
def setaccent(syls):
    """Mark the stressed syllable in ``syls`` and return ``syls``.

    The syllable lists are modified in place.

    * If a syllable already contains "s1", the first such syllable is the
      stressed one and no marker is added.
    * Otherwise, for words of two or more syllables, the last syllable is
      inspected with a final "S" ignored: "DIE" stresses the last syllable;
      a consonant ending stresses the last syllable unless it ends in BIL,
      IK, IM, UL, UM or MEN, which stress the one before it; a vowel ending
      stresses the syllable before the last. "s1" is inserted at the start
      of the chosen syllable.
    * A one-syllable word without "s1" is left unmarked.

    When the stressed syllable does not end in a consonant, "lng" is
    appended to it.
    """
    ac = next((syl for syl in syls if "s1" in syl), None)
    if ac is None and len(syls) >= 2:
        last = "".join(syls[-1])
        if last.endswith("S"):
            last = last[:-1]
        if last == "DIE":
            ac = syls[-1]
        elif last and isconsonant(last[-1]):
            # Endings that leave the stress on the preceding syllable.
            unstressed = ("BIL", "IK", "IM", "UL", "UM", "MEN")
            ac = syls[-2] if last.endswith(unstressed) else syls[-1]
        else:
            ac = syls[-2]
        ac.insert(0, "s1")
    if ac and not isconsonant(ac[-1]):
        ac.append("lng")
    return syls
# Debug trace: show the syllables with stress marks for each sample word.
if testwords:
    print("# setaccent")
    for w in testwords:
        print(w, "->", setaccent(syllablize(getphoneme(w))))
def combine(syls):
    """Join syllables into one string.

    Phonemes within a syllable are separated by a space and syllables by
    " . ". An empty syllable list gives "".
    """
    return " . ".join(" ".join(syl) for syl in syls)
def getups(word):
    """Return the phoneme string for ``word`` used in the ``ph`` attribute.

    Runs the full pipeline: getphoneme -> syllablize -> setaccent -> combine.
    For the "ro-RO" and "sk-SK" settings of the module-level ``lang``, every
    "W" in the result is replaced by "U".
    """
    ret = combine(setaccent(syllablize(getphoneme(word))))
    if lang in ("ro-RO", "sk-SK"):
        ret = ret.replace("W", "U")
    return ret
# Debug trace: show the final phoneme string for each sample word.
if testwords:
    print("# getups")
    for w in testwords:
        print(w, "->", getups(w))
def readtoken(p):
    """Read the next token from parser ``p``.

    Returns:
        ``(True, word)`` for a run that starts with a letter and continues
        with letters or hyphens, ``(False, text)`` for a run of non-letter
        characters, or ``None`` when the input is exhausted.
    """
    first = p.peek()
    if not first:
        return None
    chars = []
    if first.isalpha():
        # A hyphen may continue a word but must not start one; otherwise a
        # free-standing "-" would be returned as a word with no letters.
        while (ch := p.peek()) and (ch.isalpha() or ch == "-"):
            p.accept()
            chars.append(ch)
        return (True, "".join(chars))
    while (ch := p.peek()) and not ch.isalpha():
        p.accept()
        chars.append(ch)
    return (False, "".join(chars))
# SSML prologue: XML declaration plus the opening <speak> tag. The single %s
# placeholder receives the locale code for xml:lang.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">\n'
)
def getssml(text):
    """Render ``text`` as a complete SSML document and return it as a string.

    Every word token is wrapped in a <phoneme alphabet="ups"> element whose
    ``ph`` attribute comes from ``getups``; all other text is copied through
    with XML special characters escaped. The document starts with
    ``ssmlhdr`` filled in with the module-level ``lang`` and ends with
    ``</speak>`` on its own line.
    """
    from xml.sax.saxutils import escape

    parts = [ssmlhdr % escape(lang, {'"': "&quot;"})]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            # Word tokens hold only letters and hyphens; nothing to escape.
            parts.append(
                '<phoneme alphabet="ups" ph="%s">%s</phoneme>'
                % (getups(token), token)
            )
        else:
            # Raw "&", "<" or ">" would make the document ill-formed. Markup
            # in the input could not pass through intact anyway, because the
            # letters inside a tag are tokenized as words.
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + "</speak>"
# Debug trace: render a sample sentence as SSML.
if testwords:
    print("# getssml")
    text = "Patre nor, qui es in li cieles,"
    print(text)
    print("-" * 32)
    print(getssml(text))
# getopt short-option spec: -l and -f each take an argument.
options = "l:f:"


def usage():
    """Print the command-line synopsis to stderr and exit with status 1."""
    # stdout carries the generated SSML, so diagnostics go to stderr.
    print("usage: %s -l lang -f file | text ..." % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive interpreter and is not guaranteed to exist.
    sys.exit(1)
# Command-line entry point: read the text from -f FILE or from the remaining
# arguments, convert it, and print the SSML document to stdout.
if __name__ == "__main__":
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e, file=sys.stderr)
        usage()
    for opt, optarg in opts:
        if opt == "-l":
            lang = optarg
        elif opt == "-f":
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
    # The ca-ES setting uses its own symbol for "r".
    if lang == "ca-ES":
        phonemes["r"] = "rr"
    # Without -f (or with an empty file) the positional arguments are the text.
    if not text:
        text = " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment