Last active
May 22, 2020 16:26
-
-
Save 7shi/94d7b36ed8295939ca877ac9b82dae80 to your computer and use it in GitHub Desktop.
[py] SSML converter for Novial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/ | |
import getopt
import re
import sys
from xml.sax.saxutils import escape
# Voice locale substituted into the SSML header's xml:lang attribute; the -l
# command-line option overrides it.  Locales listed by the author:
# ca-ES, cs-CZ, de-CH, de-DE, hr-HR, hu-HU, pl-PL, ro-RO, ru-RU, sk-SK, sl-SI
lang = "sk-SK"

# Words run through the debug traces that are printed while the module loads.
# Leave empty to disable them, or use e.g.
# "nusen patre intestines familie lause".split()
testwords = []
# Maps one normalized letter to the phoneme string emitted for it.
phonemes = {}


def setphonemes(phs):
    """Register letter-to-phoneme mappings in the global ``phonemes`` table.

    *phs* is a whitespace-separated list of ``letter,phoneme`` entries.
    Existing entries for the same letter are overwritten.  An entry that
    does not split into exactly two parts on "," raises ``ValueError``.
    """
    phonemes.update(entry.split(",") for entry in phs.split())


setphonemes("a,A b,B c,TS d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,Z")
# Ordered spelling rewrites, each stored as [old, new].  They are applied in
# sequence, so the multi-letter rules ("ch", "ce", "ci") must stay ahead of
# the catch-all "c" -> "k".
repls = [
    rule.split(",")
    for rule in """
ch,ŝ ce,se ci,si c,k qu,kw sh,ŝ x,ks
au,aw eu,ew ai,ay ei,ey ie,ye
""".split()
]
def getph(ch):
    """Return the phoneme string registered for *ch*, or "" if there is none."""
    return phonemes.get(ch, "")
def getphoneme(word):
    """Convert a word into a flat list of phoneme symbols.

    The word is lower-cased, the spelling rewrites in ``repls`` are applied
    in order, and every remaining character is looked up with ``getph``.
    A lookup result may hold several phonemes separated by "|".

    Characters without a phoneme (hyphens, unmapped letters) contribute
    nothing to the result.

    Returns a list of non-empty phoneme strings; it may be empty.
    """
    word = word.lower()
    for a, b in repls:
        word = word.replace(a, b)
    ret = []
    for ch in word:
        # "".split("|") yields [""], which is truthy, so a plain
        # `if phs:` test never filtered anything.  The stray "" entries
        # made the syllable scanner stop early and drop the phonemes in
        # front of them, so empty pieces are discarded explicitly.
        ret += [ph for ph in getph(ch).split("|") if ph]
    return ret
# Debug trace: print the phoneme list of every configured sample word.
if testwords:
    print("# getphoneme")
    for w in testwords:
        print(w, "->", getphoneme(w))
class Parser:
    """Reader with one item of lookahead over any iterable.

    ``cur`` buffers the item that has been peeked but not yet consumed.
    End of input is reported as ``None``.  Buffering is decided by
    truthiness, so a falsy item is not retained between calls.
    """

    def __init__(self, src):
        self.i = iter(src)
        self.cur = None

    def peek(self):
        """Return the next item without consuming it (None at the end)."""
        if not self.cur:
            # At exhaustion the default keeps whatever was buffered before.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the next item and consume it (None at the end)."""
        item, self.cur = self.peek(), None
        return item

    def accept(self):
        """Consume the buffered item, if any, and pre-load the following one."""
        if not self.cur:
            return
        self.cur = None
        self.peek()
def isconsonant(ph):
    """Tell whether *ph* is a symbol other than one of the vowels A E I O U.

    A falsy *ph* (None or "") is handed back unchanged, so the result is
    falsy for "no phoneme".  Otherwise a bool is returned.  The vowel test
    is a substring test against "AEIOU".
    """
    if not ph:
        return ph
    return ph not in "AEIOU"
def syllablize(phs):
    """Group a flat phoneme list into syllables.

    The list is scanned from its end.  Consonants met before a vowel stay
    in the same syllable as that vowel.  Once a vowel is reached, the
    consonant directly in front of it (if any) joins the syllable.  A
    second consonant joins too when the pair is "consonant + L/R" (the
    consonant being neither J nor W nor the same L/R) or exactly K + W.
    Consonants left over at the start are attached to the first syllable.

    Returns a list of syllables, each a list of phoneme strings.
    """
    scanner = Parser(reversed(phs))
    syllables = []
    pending = []
    while (ph := scanner.read()):
        pending.insert(0, ph)
        if isconsonant(ph):
            # Still to the right of this syllable's vowel.
            continue
        onset = scanner.peek()
        if isconsonant(onset):
            scanner.accept()
            pending.insert(0, onset)
            before = scanner.peek()
            if isconsonant(before):
                liquid_cluster = (
                    onset in "LR" and before != onset and before not in "JW"
                )
                if liquid_cluster or before + onset in ["KW"]:
                    scanner.accept()
                    pending.insert(0, before)
        syllables.insert(0, pending)
        pending = []
    if not pending:
        return syllables
    if not syllables:
        # No vowel at all: the consonants form the only syllable.
        return [pending]
    syllables[0] = pending + syllables[0]
    return syllables
# Debug trace: print the syllable split of every configured sample word.
if testwords:
    print("# syllablize")
    for w in testwords:
        print(w, "->", syllablize(getphoneme(w)))
def setaccent(syls):
    """Mark the accented syllable in *syls* (modified in place) and return it.

    * A syllable that already contains "s1" is taken as the accented one.
    * With two or more syllables, one trailing M, N, S or D of the last
      syllable is ignored.  If what remains still ends in a consonant, the
      last syllable is accented.  Otherwise the accent goes to the syllable
      in front of the last syllable that starts with a consonant, falling
      back to the first syllable.  "s1" is inserted at its front.
    * A lone syllable made of a single phoneme is selected without "s1".

    If the selected syllable ends in a vowel, "lng" is appended to it.
    """
    marked = [syl for syl in syls if "s1" in syl]
    stressed = None
    if marked:
        stressed = marked[0]
    elif len(syls) >= 2:
        tail = "".join(syls[-1])
        for suffix in ("M", "N", "S", "D"):
            if tail.endswith(suffix):
                tail = tail[:-len(suffix)]
                break
        if tail and isconsonant(tail[-1]):
            stressed = syls[-1]
        else:
            # Search from the end for a syllable opening with a consonant.
            for idx in range(len(syls) - 1, 0, -1):
                if isconsonant(syls[idx][0]):
                    stressed = syls[idx - 1]
                    break
            if not stressed:
                stressed = syls[0]
        stressed.insert(0, "s1")
    elif syls and len(syls[0]) == 1:
        stressed = syls[0]
    if stressed and not isconsonant(stressed[-1]):
        stressed.append("lng")
    return syls
# Debug trace: print the accented syllables of every configured sample word.
if testwords:
    print("# setaccent")
    for w in testwords:
        print(w, "->", setaccent(syllablize(getphoneme(w))))
def combine(syls):
    """Render syllables as one string.

    Phonemes inside a syllable are separated by a space, and syllables by
    " . ".
    """
    return " . ".join(" ".join(syl) for syl in syls)
def getups(word):
    """Return the phoneme string used as the ``ph`` attribute for *word*.

    The word goes through ``getphoneme``, ``syllablize``, ``setaccent`` and
    ``combine``.  Two adjustments depend on the global ``lang``:

    * sk-SK: a syllable-final "J" is replaced with "I";
    * ro-RO, sk-SK, ru-RU: every "W" in the result becomes "U".
    """
    syllables = setaccent(syllablize(getphoneme(word)))
    if lang == "sk-SK":
        for syllable in syllables:
            if syllable[-1] == "J":
                syllable[-1] = "I"
    ups = combine(syllables)
    if lang in ("ro-RO", "sk-SK", "ru-RU"):
        return ups.replace("W", "U")
    return ups
# Debug trace: print the final phoneme string of every configured sample word.
if testwords:
    print("# getups")
    for w in testwords:
        print(w, "->", getups(w))
def readtoken(p):
    """Read the next token from parser *p*.

    Returns ``None`` at the end of input, ``(True, word)`` for a run of
    letters and hyphens, or ``(False, text)`` for a run of non-letters.
    A hyphen only counts as part of a word when the run starts at the
    current position; inside a non-letter run it is kept with that run.
    """
    if not p.peek():
        return None
    chars = []
    while (ch := p.peek()) and (ch == "-" or str.isalpha(ch)):
        p.accept()
        chars.append(ch)
    if chars:
        return (True, "".join(chars))
    while (ch := p.peek()) and not str.isalpha(ch):
        p.accept()
        chars.append(ch)
    return (False, "".join(chars))
# SSML prologue; the single %s placeholder receives the xml:lang value.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">\n'
)
def getssml(text):
    """Convert plain *text* into a complete SSML document.

    The text is split with ``readtoken``.  Every word is wrapped in a
    ``<phoneme alphabet="ups">`` element whose ``ph`` attribute comes from
    ``getups``; everything else is copied through.  The document starts
    with ``ssmlhdr`` filled in with the global ``lang`` and ends with
    ``</speak>`` on its own line.

    All copied text is XML-escaped, so "&", "<" and ">" in the input no
    longer produce malformed SSML.  A word for which no phonemes could be
    derived (for example a lone "-") is emitted as plain text instead of
    an element with an empty ``ph`` attribute.

    Returns the SSML document as a string.
    """
    parts = [ssmlhdr % lang]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        ups = getups(token) if alpha else ""
        body = escape(token)
        if ups:
            # ups holds only phoneme symbols, spaces and dots taken from
            # the phoneme table, so it needs no attribute escaping.
            parts.append(
                '<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (ups, body)
            )
        else:
            parts.append(body)
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + "</speak>"
# Debug trace: convert one fixed sample sentence and print the SSML.
if testwords:
    print("# getssml")
    text = "Nusen Patre, kel es in siele,"
    print(text)
    print("-" * 32)
    print(getssml(text))
# getopt short-option spec: -l and -f each take an argument.
options = "l:f:"


def usage():
    """Print the command-line synopsis and terminate with exit status 1.

    Raises ``SystemExit`` and never returns.
    """
    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
    # The bare exit() builtin is injected by the `site` module for
    # interactive use and is missing under `python -S` or in frozen
    # builds, so sys.exit() is used instead.
    sys.exit(1)
# Command-line entry point: pick the locale and the input text, then print
# the SSML document to standard output.
if __name__ == "__main__":
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as err:
        print(err)
        usage()
    for flag, value in opts:
        if flag == "-l":
            lang = value
        elif flag == "-f":
            with open(value, encoding="utf-8") as f:
                text = f.read()
    if lang == "ca-ES":
        # This locale gets a different phoneme for "r".
        phonemes["r"] = "rr"
    # Without -f (or with an empty file) the remaining arguments are the text.
    text = text or " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment