Last active
May 19, 2020 14:06
-
-
Save 7shi/adb49f584096d2c68d7839dd5a6f4fe7 to your computer and use it in GitHub Desktop.
[py] SSML converter for Interlingue/Occidental
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
import getopt
import re
import sys
from xml.sax.saxutils import escape
# Language tag written to xml:lang in the SSML header; "-l" on the command
# line overrides it.
lang = "ro-RO"  # ca-ES, pl-PL, ro-RO, sk-SK, sl-SI

# Put words in this list to print the intermediate result of every stage
# (getphoneme, syllablize, setaccent, getups, getssml) when the module loads.
testwords = []  # "intercalar forme familie rapidmen duplic café ínpossibil".split()

# Letter -> phoneme string. A value may hold several symbols separated by "|"
# (e.g. "s1|A"). Filled in by setphonemes().
phonemes = {}
def setphonemes(phs):
    """Register letter-to-phoneme mappings in the global ``phonemes`` table.

    phs: whitespace-separated ``letter,phoneme`` entries, e.g. ``"a,A b,B"``.
    A phoneme may contain several ``|``-separated symbols (``"á,s1|A"``).
    An existing mapping for the same letter is overwritten.

    Raises ValueError if an entry does not contain exactly one comma.
    """
    for entry in phs.split():
        letter, phoneme = entry.split(",")
        phonemes[letter] = phoneme
# Phoneme table. "ć", "ŝ" and "ź" are not expected in the input; they are
# internal placeholders produced by the rewrite rules in getphoneme().
setphonemes("a,A b,B ć,TS d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,DZ ź,Z")
# Accented vowels carry the stress marker "s1" in front of the vowel.
setphonemes("á,s1|A é,s1|E í,s1|I ó,s1|O ú,s1|U")

# Spelling rewrite rules as [from, to] pairs. getphoneme() applies them one
# after another in this order, so "ch", "ce" and "ci" must stay ahead of the
# catch-all "c" rule.
repls = [pair.split(",") for pair in """
ch,ŝ ce,će ci,ći c,k qu,kw x,ks ge,je gi,ji ss,s
tia,ćia tie,ćie tio,ćio tiu,ćiu
""".strip().split()]
def getph(ch):
    """Return the phoneme string mapped to letter ``ch``, or "" if there is none."""
    return phonemes.get(ch, "")
def getphoneme(word):
    """Convert a word into a flat list of phoneme symbols.

    The word is lower-cased, an "s" between two unaccented vowels is rewritten
    to the placeholder "ź", the rules in ``repls`` are applied in order, and
    each remaining character is looked up in the phoneme table. Characters
    without a mapping (a hyphen, for instance) contribute nothing.

    Returns a list of symbols such as ``["K", "A", "F", "s1", "E"]``.
    """
    word = re.sub("([aeiou])s([aeiou])", r"\1ź\2", word.lower())
    for a, b in repls:
        word = word.replace(a, b)
    ret = []
    for ch in word:
        ph = getph(ch)
        # Test the string itself: "".split("|") is [""], which is truthy and
        # would append an empty symbol that stops syllablize() early.
        if ph:
            ret += ph.split("|")
    return ret
# Debug output: phoneme list of every test word.
if testwords:
    print("# getphoneme")
    for w in testwords:
        print(w, "->", getphoneme(w))
class Parser:
    """Reader with one item of look-ahead over any iterable.

    ``cur`` holds the looked-ahead item, or None when nothing is buffered.
    The buffer is tested by truthiness, so a falsy item (such as "") is
    returned by the call that fetches it but is not retained for the next
    ``peek``. Once the input is exhausted, ``peek`` and ``read`` return the
    buffer unchanged, which is normally None.
    """

    def __init__(self, src):
        self.i = iter(src)  # underlying iterator
        self.cur = None     # looked-ahead item, None if nothing is buffered

    def peek(self):
        """Return the next item without consuming it."""
        if not self.cur:
            # At end of input next() falls back to the current buffer value,
            # i.e. the buffer is left as it is.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the next item and consume it."""
        ret = self.peek()
        self.cur = None
        return ret

    def accept(self):
        """Consume the item last returned by ``peek`` and pre-load the next one."""
        # NOTE(review): the nesting of the peek() call was reconstructed from
        # unindented text; callers only invoke accept() after a truthy peek(),
        # where either nesting behaves the same.
        if self.cur:
            self.cur = None
        self.peek()
def isconsonant(ph):
    """Return True if ``ph`` is a non-empty symbol other than a vowel phoneme.

    The vowels are exactly "A", "E", "I", "O" and "U". Every other non-empty
    symbol counts as a consonant, including markers such as "s1". ``None``
    and "" give False.
    """
    # Exact membership instead of a substring test against "AEIOU", and an
    # explicit bool so the result is never None or "".
    return bool(ph) and ph not in ("A", "E", "I", "O", "U")
def syllablize(phs):
    """Group a flat phoneme list into syllables.

    The phonemes are scanned from the end of the word towards the start.
    Consonants met before a vowel is reached are collected and end up after
    that vowel in its syllable. When a vowel is reached, the syllable may
    additionally take, in front of the vowel:

    * an "s1" marker, or an "I"/"U" that differs from the vowel itself;
    * then one consonant;
    * then a second consonant, but only if the first one is "L" or "R" and
      the second is a different consonant, or if the two spell "KW".

    Consonants left over at the start of the word are attached to the first
    syllable; a word without any vowel becomes a single syllable.

    Returns a list of phoneme lists, e.g. ``[["F", "A"], ["M", "I"]]``.
    """
    p = Parser(reversed(phs))
    ret = []
    cur = []
    while (ph := p.read()):
        cur.insert(0, ph)
        if isconsonant(ph):
            # Trailing consonant: keep collecting until a vowel turns up.
            continue
        c1 = p.peek()
        if c1 == "s1" or ((c1 == "I" or c1 == "U") and ph != c1):
            p.accept()
            cur.insert(0, c1)
            c1 = p.peek()
        if isconsonant(c1):
            p.accept()
            cur.insert(0, c1)
            if isconsonant(c2 := p.peek()):
                if c1 in "LR" and c2 != c1:
                    p.accept()
                    cur.insert(0, c2)
                elif c2 + c1 == "KW":
                    p.accept()
                    cur.insert(0, c2)
        ret.insert(0, cur)
        cur = []
    if cur:
        # Leading consonants that no vowel picked up.
        if ret:
            ret[0] = cur + ret[0]
        else:
            ret = [cur]
    return ret
# Debug output: syllable split of every test word.
if testwords:
    print("# syllablize")
    for w in testwords:
        print(w, "->", syllablize(getphoneme(w)))
def setaccent(syls):
    """Mark the stressed syllable of a word in place and return ``syls``.

    * If some syllable already contains "s1" (from an accented letter), the
      first such syllable is the stressed one and no marker is added.
    * Otherwise, for words of two or more syllables, the last syllable is
      inspected with one trailing "S" ignored:
        - "DIE"                                   -> stress the last syllable
        - ends in a consonant, with one of the
          endings BIL, IK, IM, UL, UM, MEN        -> stress the second to last
        - ends in any other consonant             -> stress the last
        - ends in a vowel                         -> stress the second to last
      and "s1" is inserted at the start of the chosen syllable.
    * Words of one syllable without an accented letter get no marker.

    If the stressed syllable ends in a vowel, "lng" is appended to it.

    NOTE(review): an "s1" coming from an accented letter stays directly in
    front of its vowel, while a computed one goes to the start of the
    syllable; confirm that this difference is intended.
    """
    ac = None
    marked = [syl for syl in syls if "s1" in syl]
    if marked:
        ac = marked[0]
    elif len(syls) >= 2:
        last = "".join(syls[-1])
        if last.endswith("S"):
            last = last[:-1]
        if last == "DIE":
            ac = syls[-1]
        elif last and isconsonant(last[-1]):
            # `last` may be empty after the trailing "S" is dropped, hence
            # the guard before indexing.
            if last.endswith(("BIL", "IK", "IM", "UL", "UM", "MEN")):
                ac = syls[-2]
            else:
                ac = syls[-1]
        else:
            ac = syls[-2]
        ac.insert(0, "s1")
    if ac and not isconsonant(ac[-1]):
        ac.append("lng")
    return syls
# Debug output: syllables of every test word with stress applied.
if testwords:
    print("# setaccent")
    for w in testwords:
        print(w, "->", setaccent(syllablize(getphoneme(w))))
def combine(syls):
    """Join syllables into one string.

    Phonemes inside a syllable are separated by a space, syllables by " . ".
    An empty list gives "".
    """
    return " . ".join(" ".join(syl) for syl in syls)
def getups(word):
    """Return the phoneme string for ``word`` as used in the ``ph`` attribute.

    Runs getphoneme -> syllablize -> setaccent -> combine. When the global
    ``lang`` is "ro-RO" or "sk-SK", every "W" in the result is replaced by "U"
    (presumably because those voices have no W phoneme; confirm).
    """
    ups = combine(setaccent(syllablize(getphoneme(word))))
    if lang in ("ro-RO", "sk-SK"):
        ups = ups.replace("W", "U")
    return ups
# Debug output: final phoneme string of every test word.
if testwords:
    print("# getups")
    for w in testwords:
        print(w, "->", getups(w))
def readtoken(p):
    """Read the next token from parser ``p``.

    Returns ``(True, word)`` for a run of letters, which may contain hyphens
    after its first letter, ``(False, text)`` for a run of non-letters, or
    None at end of input.
    """
    if not p.peek():
        return None
    word = []
    # A hyphen only joins parts of a word, so it is accepted once at least one
    # letter has been read. A token made of hyphens alone would otherwise be
    # treated as a word with an empty pronunciation.
    while (ch := p.peek()) and (ch.isalpha() or (ch == "-" and word)):
        p.accept()
        word.append(ch)
    if word:
        return (True, "".join(word))
    other = []
    while (ch := p.peek()) and not ch.isalpha():
        p.accept()
        other.append(ch)
    return (False, "".join(other))
# SSML prologue; "%s" receives the language tag. The leading newline of the
# literal is stripped so that the XML declaration is the very first line.
ssmlhdr = """
<?xml version="1.0" encoding="UTF-8"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">
""".lstrip()
def getssml(text):
    """Convert plain text into an SSML document string.

    Every word is wrapped in a ``<phoneme alphabet="ups">`` element whose
    ``ph`` attribute comes from getups(). Everything between words
    (whitespace, punctuation, digits) is copied through with "&", "<" and ">"
    escaped so the result stays well-formed XML. The header uses the global
    ``lang``, and the closing ``</speak>`` is placed on its own line.
    """
    parts = [ssmlhdr % lang]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            # Word tokens hold only letters and hyphens; no escaping needed.
            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
        else:
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + '</speak>'
# Debug output: complete SSML document for a sample sentence.
if testwords:
    print("# getssml")
    text = "Patre nor, qui es in li cieles,"
    print(text)
    print("-" * 32)
    print(getssml(text))

# getopt short-option spec: "-l" and "-f" each take an argument.
options = "l:f:"
def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
    # sys.exit rather than the bare exit(), which is only injected by the
    # site module and is absent under "python -S" or in frozen builds.
    sys.exit(1)
# Command-line entry point:
#   -l LANG   language tag for the SSML header (replaces the default)
#   -f FILE   read the text from a UTF-8 file
# Without -f, or if the file is empty, the remaining arguments joined by
# spaces are used as the text.
if __name__ == "__main__":
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e)
        usage()
    for opt, optarg in opts:
        if opt == "-l":
            lang = optarg
        elif opt == "-f":
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
    # For ca-ES the letter "r" is mapped to "rr" instead of "R".
    if lang == "ca-ES":
        phonemes["r"] = "rr"
    if not text:
        text = " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment