Last active
October 18, 2022 04:10
-
-
Save 7shi/49fca97c08eb06edb6bed423502c784e to your computer and use it in GitHub Desktop.
[py] SSML converter for Ido
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/ | |
# Maps an Ido grapheme (a single letter or a digraph) to its phoneme symbol.
phonemes = {}
# Keys are the first letters of the multi-letter graphemes; the values are
# unused. A letter present here may start a digraph and needs one character
# of lookahead.
phonemes2 = {}


def setphonemes(phs):
    """Register grapheme-to-phoneme pairs in the module-level tables.

    Args:
        phs: Whitespace-separated ``grapheme,phoneme`` pairs,
            e.g. ``"a,A ch,CH"``.

    Raises:
        ValueError: If an entry does not contain exactly one comma.
    """
    for pair in phs.split():
        grapheme, phoneme = pair.split(",")
        phonemes[grapheme] = phoneme
        if len(grapheme) > 1:
            phonemes2[grapheme[0]] = 1


setphonemes("a,A b,B c,TS ch,CH d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P qu,KW")
setphonemes("r,R s,S sh,SH t,T u,U v,V w,W x,KS y,J z,Z")
# Sample Ido words printed by the disabled ``if False:`` debug sections.
tests = [
    "mashino", "aquo", "linguo", "patro", "strato",
    "serchar", "familio", "dio", "manuo", "frua",
]
def getph(ch):
    """Return the phoneme symbol for grapheme *ch*, or ``""`` if unknown."""
    return phonemes.get(ch, "")
class Parser:
    """One-item lookahead reader over an iterable.

    The lookahead slot is tested by truthiness: a falsy slot means "nothing
    buffered". Consequently a falsy item (such as ``""``) that was only
    peeked does not stay buffered and is replaced by the following item on
    the next ``peek()``.
    """

    def __init__(self, src):
        self.i = iter(src)  # underlying iterator
        self.cur = None     # buffered lookahead item, if any

    def peek(self):
        """Return the next item without consuming it.

        Returns a falsy value (``None`` for a fresh parser) once the
        underlying iterator is exhausted.
        """
        if not self.cur:
            try:
                self.cur = next(self.i)
            except StopIteration:
                pass  # keep the empty slot; callers see a falsy result
        return self.cur

    def read(self):
        """Consume and return the next item (falsy at end of input)."""
        ret = self.peek()
        self.cur = None
        return ret

    def accept(self):
        """Discard the buffered item, if any, and buffer the one after it."""
        if self.cur:
            self.cur = None
            self.peek()
def getphoneme(word):
    """Convert a lower-case Ido word into a list of phoneme symbols.

    Digraphs registered in the phoneme table (e.g. ``ch``, ``sh``, ``qu``)
    are matched greedily using one character of lookahead. Characters with
    no entry in the table (apostrophes, foreign letters) are dropped rather
    than emitted as ``""``: an empty phoneme would stop the read loop in
    syllablize() and cut the word short.

    Args:
        word: The word to convert; expected to be lower-cased already.

    Returns:
        A list of non-empty phoneme symbols.
    """
    p = Parser(word)
    result = []
    while ch := p.read():
        if ch in phonemes2:
            # ch may start a digraph; only consume the next character when
            # the pair is a registered grapheme.
            nxt = p.peek()
            if nxt and ch + nxt in phonemes:
                ch += p.read()
        ph = getph(ch)
        if ph:
            result.append(ph)
    return result
# Debug output for getphoneme(); change the condition to True to enable.
if False:
    print("# getphoneme")
    for w in tests:
        print(w, "->", getphoneme(w))
def isconsonant(ph):
    """Return True if *ph* is a non-empty phoneme symbol that is not a vowel.

    A symbol counts as a vowel when its first character is one of ``AEIOU``.
    Falsy input (``None`` or ``""``) yields False, so a lookahead result can
    be passed in without a separate end-of-input check.
    """
    return bool(ph) and ph[0] not in "AEIOU"
def syllablize(phs):
    """Group a word's phoneme symbols into syllables.

    The phonemes are scanned from the end of the word. Each vowel closes a
    syllable after claiming the consonant in front of it as its onset; a
    second consonant joins the onset only when the first one is L or R and
    the two differ (e.g. T+R, P+L). Any other consonant stays behind as the
    coda of the preceding syllable.

    Args:
        phs: Sequence of phoneme symbols (must support ``reversed()``).

    Returns:
        A list of syllables, each a list of phoneme symbols.
    """
    p = Parser(reversed(phs))
    ret = []
    cur = []
    while ph := p.read():
        cur.insert(0, ph)
        if isconsonant(ph):
            continue  # coda consonant: keep collecting until the vowel
        c1 = p.peek()
        if isconsonant(c1):
            p.accept()
            cur.insert(0, c1)
            c2 = p.peek()
            if isconsonant(c2) and c1 in ("L", "R") and c2 != c1:
                p.accept()
                cur.insert(0, c2)
        ret.insert(0, cur)
        cur = []
    if cur:
        # Leftover leading consonants have no vowel of their own: attach
        # them to the first syllable (or keep them as the only syllable).
        if ret:
            ret[0] = cur + ret[0]
        else:
            ret = [cur]
    # Diphthong: in words of three or more syllables, a final syllable that
    # starts with a vowel is merged into a preceding syllable ending in I/U.
    if (len(ret) >= 3 and not isconsonant(ret[-1][0])
            and ret[-2][-1] in ("I", "U")):
        last = ret.pop()
        ret[-1] += last
    return ret
# Debug output for syllablize(); change the condition to True to enable.
if False:
    print("# syllablize")
    for w in tests:
        print(w, "->", syllablize(getphoneme(w)))
def setaccent(syls):
    """Insert stress (``"s1"``) and length (``"lng"``) markers into *syls*.

    Words with fewer than two syllables are returned untouched. Infinitives
    (last syllable ending in A, I or O followed by R) are stressed on the
    last syllable; every other word is stressed on the penultimate syllable,
    which additionally gets a trailing ``"lng"`` when it ends in a vowel.

    Args:
        syls: List of syllables (lists of phoneme symbols). The outer list
            is modified in place; the marked syllable is replaced by a new
            list.

    Returns:
        The same list object, for call chaining.
    """
    if len(syls) < 2:
        return syls
    last = syls[-1]
    # Ido infinitives end in -ar (present), -ir (past) or -or (future).
    if len(last) >= 2 and last[-1] == "R" and last[-2] in ("A", "I", "O"):
        syls[-1] = ["s1"] + last
    else:
        syls[-2] = ["s1"] + syls[-2]
        if not isconsonant(syls[-2][-1]):
            syls[-2].append("lng")
    return syls
# Debug output for setaccent(); change the condition to True to enable.
if False:
    print("# setaccent")
    for w in tests:
        print(w, "->", setaccent(syllablize(getphoneme(w))))
def combine(syls):
    """Join syllables into one string.

    Symbols inside a syllable are separated by a space, syllables by
    ``" . "``.
    """
    return " . ".join(" ".join(syl) for syl in syls)
def getups(word):
    """Return the phoneme string for an Ido *word*.

    The word is lower-cased, converted to phonemes, split into syllables and
    given stress markers. Afterwards the composite symbols are rewritten:
    ``W`` becomes ``U`` (so ``KW`` becomes ``K U``) and ``KS`` becomes
    ``K S``.
    """
    syls = setaccent(syllablize(getphoneme(word.lower())))
    ups = combine(syls)
    return ups.replace("W", "U").replace("KU", "K U").replace("KS", "K S")
# Debug output for getups(); change the condition to True to enable.
if False:
    print("# getups")
    for w in tests:
        print(w, "->", getups(w))
def readtoken(p):
    """Read the next token from parser *p*.

    Args:
        p: An object providing ``peek()`` and ``accept()`` over characters.

    Returns:
        ``None`` at end of input. Otherwise a ``(is_word, text)`` tuple:
        ``(True, text)`` for a run of alphabetic characters and apostrophes,
        or ``(False, text)`` for a run of non-alphabetic characters.
    """
    if not p.peek():
        return None
    chars = []
    # Word token: letters, with apostrophes allowed anywhere in the run.
    while (ch := p.peek()) and (ch.isalpha() or ch == "'"):
        p.accept()
        chars.append(ch)
    if chars:
        return (True, "".join(chars))
    # Separator token: everything up to the next alphabetic character.
    while (ch := p.peek()) and not ch.isalpha():
        p.accept()
        chars.append(ch)
    return (False, "".join(chars))
# Opening lines of every generated document: XML declaration and <speak> tag.
# lstrip() removes the newline that follows the opening triple quote.
ssmlhdr = """
<?xml version="1.0" encoding="UTF-8"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sk-SK">
""".lstrip()
def getssml(text):
    """Convert Ido *text* into an SSML document string.

    Every word token is wrapped in a ``<phoneme alphabet="ups">`` element
    whose ``ph`` attribute comes from getups(); everything between words is
    copied through. Copied text is XML-escaped so that ``&``, ``<`` and ``>``
    in the input cannot produce a malformed document.

    Args:
        text: Plain text to convert.

    Returns:
        The document: header, converted body (newline-terminated) and the
        closing ``</speak>`` tag.
    """
    # Local import keeps this function usable without touching file imports.
    from xml.sax.saxutils import escape

    parts = [ssmlhdr]
    p = Parser(text)
    while t := readtoken(p):
        alpha, token = t
        if alpha:
            parts.append(
                '<phoneme alphabet="ups" ph="%s">%s</phoneme>'
                % (getups(token), escape(token))
            )
        else:
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + '</speak>'
# Debug output for getssml(); change the condition to True to enable.
if False:
    print("# getssml")
    text = "L'amiko serchas la familio."
    print(text)
    print("-" * 32)
    print(getssml(text))
import getopt
import sys

# getopt option string: -f takes a file name argument.
options = "f:"


def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("usage: %s -f file | text ..." % sys.argv[0])
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(1)
# Command-line entry point: read the text from a file given with -f, or from
# the remaining arguments joined by spaces, and print the SSML document.
if __name__ == "__main__":
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e)
        usage()
    for opt, optarg in opts:
        if opt == "-f":
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
    # No -f option, or the file was empty: fall back to the arguments.
    if not text:
        text = " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment