Skip to content

Instantly share code, notes, and snippets.

@7shi
Last active May 22, 2020
Embed
What would you like to do?
[py] SSML converter for Novial
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
import getopt, re, sys
# Locale written into the xml:lang attribute of the SSML header; the -l
# command-line option replaces it.  Locales listed by the author:
# ca-ES, cs-CZ, de-CH, de-DE, hr-HR, hu-HU, pl-PL, ro-RO, ru-RU, sk-SK, sl-SI
lang = "sk-SK"

# Sample words for the debug printouts that follow each stage; while this list
# is empty those printouts are skipped.
# Example: "nusen patre intestines familie lause".split()
testwords = []

# Letter -> phoneme symbol table, filled in by setphonemes().
phonemes = {}
def setphonemes(phs):
    """Add letter-to-phoneme entries to the module-level ``phonemes`` table.

    ``phs`` is a whitespace-separated list of ``letter,SYMBOL`` pairs.  An
    existing entry for the same letter is overwritten.  A pair that does not
    split into exactly two parts on "," raises ``ValueError``.
    """
    for entry in phs.split():
        letter, symbol = entry.split(",")
        phonemes[letter] = symbol
# Base letter -> phoneme symbol assignments.
setphonemes(
    "a,A b,B c,TS d,D e,E f,F g,G h,H "
    "i,I j,ZH k,K l,L m,M n,N o,O p,P "
    "r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,Z"
)

# Spelling rewrites that getphoneme() applies, in this order, before the
# per-letter lookup.  Order matters: the ch/ce/ci rules must run before the
# catch-all "c,k" rule.
repls = [pair.split(",") for pair in """
ch,ŝ ce,se ci,si c,k qu,kw sh,ŝ x,ks
au,aw eu,ew ai,ay ei,ey ie,ye
""".split()]
def getph(ch):
    """Return the phoneme symbol registered for ``ch``, or "" when there is none."""
    return phonemes.get(ch, "")
def getphoneme(word):
    """Convert a word into a flat list of phoneme symbols.

    The word is lower-cased, every rewrite in ``repls`` is applied in table
    order, and each remaining character is looked up with ``getph``.  A table
    entry may carry several symbols separated by "|".

    Characters that have no phoneme (for example "-") contribute nothing to
    the result.

    Returns:
        list of non-empty phoneme symbol strings, in word order.
    """
    word = word.lower()
    for old, new in repls:
        word = word.replace(old, new)
    ret = []
    for ch in word:
        # "".split("|") is [""], so an unmapped character must be filtered
        # explicitly; an empty symbol is falsy and would make syllablize()
        # stop reading the word at that point.
        ret.extend(ph for ph in getph(ch).split("|") if ph)
    return ret
# Debug printout of the letter -> phoneme stage for each sample word.
if testwords:
    print("# getphoneme")
    for w in testwords:
        print("%s -> %s" % (w, getphoneme(w)))
class Parser:
    """One-item look-ahead reader over any iterable.

    ``cur`` holds the buffered look-ahead item, or ``None`` when nothing is
    buffered.  ``None`` is also what ``peek``/``read`` return once the source
    is exhausted.  The buffer check uses truthiness, so a falsy item is
    treated like an empty buffer.
    """

    def __init__(self, src):
        self.i = iter(src)
        self.cur = None

    def peek(self):
        """Return the next item without consuming it (``None`` at the end)."""
        if not self.cur:
            # On exhaustion next() hands back the current value unchanged.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the next item and consume it (``None`` at the end)."""
        item, self.cur = self.peek(), None
        return item

    def accept(self):
        """Consume the buffered item, if any, and buffer the following one."""
        if not self.cur:
            return
        self.cur = None
        self.peek()
def isconsonant(ph):
    """Tell whether ``ph`` is a phoneme symbol other than a vowel.

    A falsy ``ph`` (``None`` or "") is returned unchanged.  Otherwise the
    result is ``True`` unless ``ph`` occurs as a substring of "AEIOU".
    """
    if not ph:
        return ph
    return ph not in "AEIOU"
def syllablize(phs):
    """Group a flat phoneme list into syllables (a list of phoneme lists).

    The phonemes are scanned from the end of the word.  Consonants are
    collected until a vowel is met; the vowel then closes the syllable after
    taking up to two preceding consonants as its onset:

    * the consonant directly before the vowel is always taken;
    * the one before that is taken too when the pair is a consonant followed
      by a different L or R (the earlier consonant not being J or W), or when
      the pair is K + W.

    Consonants left over at the start of the word are prepended to the first
    syllable, or form the only syllable when the word has no vowel.
    """
    p = Parser(reversed(phs))
    syllables = []
    pending = []
    while True:
        ph = p.read()
        if not ph:
            break
        pending.insert(0, ph)
        if isconsonant(ph):
            continue
        onset = p.peek()
        if isconsonant(onset):
            p.accept()
            pending.insert(0, onset)
            before = p.peek()
            if isconsonant(before):
                liquid_cluster = (
                    onset in "LR" and before != onset and before not in "JW"
                )
                if liquid_cluster or before + onset in ["KW"]:
                    p.accept()
                    pending.insert(0, before)
        syllables.insert(0, pending)
        pending = []
    if not pending:
        return syllables
    if not syllables:
        return [pending]
    syllables[0] = pending + syllables[0]
    return syllables
# Debug printout of the syllable split for each sample word.
if testwords:
    print("# syllablize")
    for w in testwords:
        print("%s -> %s" % (w, syllablize(getphoneme(w))))
def setaccent(syls):
    """Mark the stressed syllable in place and return ``syls``.

    * If a syllable already contains "s1", that syllable is the stressed one
      and no new marker is inserted.
    * With two or more syllables, a final M, N, S or D is ignored on the last
      syllable.  If what remains ends in a consonant the last syllable is
      stressed; otherwise the syllable preceding the last consonant-initial
      syllable is stressed, falling back to the first syllable.  "s1" is
      inserted at the front of the chosen syllable.
    * A single syllable made of one phoneme is selected without an "s1".

    When the selected syllable does not end in a consonant, "lng" is appended
    to it.
    """
    ac = next((syl for syl in syls if "s1" in syl), None)
    if ac is None:
        if len(syls) >= 2:
            final = "".join(syls[-1])
            if final.endswith(("M", "N", "S", "D")):
                final = final[:-1]
            if final and isconsonant(final[-1]):
                ac = syls[-1]
            else:
                for idx in range(len(syls) - 1, 0, -1):
                    if isconsonant(syls[idx][0]):
                        ac = syls[idx - 1]
                        break
                if not ac:
                    ac = syls[0]
            ac.insert(0, "s1")
        elif syls and len(syls[0]) == 1:
            ac = syls[0]
    if ac and not isconsonant(ac[-1]):
        ac.append("lng")
    return syls
# Debug printout of the stress marking for each sample word.
if testwords:
    print("# setaccent")
    for w in testwords:
        print("%s -> %s" % (w, setaccent(syllablize(getphoneme(w)))))
def combine(syls):
    """Join syllables into one string.

    Phonemes within a syllable are separated by a space, and syllables are
    separated by " . ".
    """
    return " . ".join(" ".join(syl) for syl in syls)
def getups(word):
    """Return the phoneme string for ``word`` as used in the ``ph`` attribute.

    The word goes through getphoneme -> syllablize -> setaccent -> combine.
    Two adjustments depend on the module-level ``lang``:

    * sk-SK: a syllable-final "J" becomes "I";
    * ro-RO, sk-SK, ru-RU: every "W" in the combined string becomes "U".
    """
    syls = setaccent(syllablize(getphoneme(word)))
    if lang == "sk-SK":
        for syl in syls:
            if syl[-1] == "J":
                syl[-1] = "I"
    ups = combine(syls)
    if lang in ("ro-RO", "sk-SK", "ru-RU"):
        return ups.replace("W", "U")
    return ups
# Debug printout of the final phoneme string for each sample word.
if testwords:
    print("# getups")
    for w in testwords:
        print("%s -> %s" % (w, getups(w)))
def readtoken(p):
    """Read the next token from parser ``p``.

    Returns ``None`` when the input is exhausted.  Otherwise returns a pair
    ``(is_word, text)``:

    * ``(True, text)`` for a run of alphabetic characters and "-";
    * ``(False, text)`` for a run of non-alphabetic characters, taken only
      when the input does not start with a word character.
    """
    if not p.peek():
        return None

    def take(keep):
        # Consume characters for as long as ``keep`` accepts them.
        taken = []
        while (ch := p.peek()) and keep(ch):
            p.accept()
            taken.append(ch)
        return "".join(taken)

    word = take(lambda ch: str.isalpha(ch) or ch == "-")
    if word:
        return (True, word)
    return (False, take(lambda ch: not str.isalpha(ch)))
# Opening of the SSML document; the single %s receives the xml:lang value.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">\n'
)
def getssml(text):
    """Render ``text`` as an SSML document for the module-level ``lang``.

    Word tokens (as split by ``readtoken``) are wrapped in a
    ``<phoneme alphabet="ups">`` element whose ``ph`` attribute comes from
    ``getups``.  Everything else is copied through as character data.

    Returns:
        The complete document as a string: header, body, a newline if the
        body does not already end with one, and the closing ``</speak>``.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # lang lands inside a double-quoted attribute, so quotes are escaped too.
    parts = [ssmlhdr % escape(lang, {'"': "&quot;"})]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        ups = getups(token) if alpha else ""
        if ups:
            parts.append(
                '<phoneme alphabet="ups" ph="%s">%s</phoneme>'
                % (ups, escape(token))
            )
        else:
            # Punctuation, digits, and words with no transcription (which
            # would otherwise yield an empty ph attribute) are emitted as
            # escaped text so "&", "<" and ">" cannot break the XML.
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + '</speak>'
# Debug printout of a complete SSML document for a fixed sample sentence.
if testwords:
    text = "Nusen Patre, kel es in siele,"
    print("# getssml")
    print(text)
    print("-" * 32)
    print(getssml(text))
# Short-option spec passed to getopt: -l and -f each take an argument.
options = "l:f:"


def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
    # sys.exit rather than the bare exit(): the latter is a helper added by
    # the site module and is missing under "python -S" or in frozen builds.
    sys.exit(1)
# Command-line entry point.  Kept at module scope so that assigning ``lang``
# rebinds the module-level setting read by getups() and getssml().
if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e)
        usage()
    text = None
    for opt, optarg in opts:
        if opt == "-f":
            # Input text is read from a UTF-8 file.
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
        elif opt == "-l":
            lang = optarg
    if lang == "ca-ES":
        # ca-ES uses a different symbol for "r".
        phonemes["r"] = "rr"
    # Without usable file content, the positional arguments are the text.
    text = text or " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment