Created
May 15, 2020 15:47
-
-
Save 7shi/92794319bebef470dd71e68d4e3e599d to your computer and use it in GitHub Desktop.
[py] SSML converter for Esperanto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CC0 http://creativecommons.org/publicdomain/zero/1.0/
# Maps one lowercase letter to the phoneme symbol emitted for it.
phonemes = {}


def setphonemes(phs):
    """Register letter-to-phoneme mappings in the module-level table.

    *phs* is a whitespace-separated list of ``letter,symbol`` pairs,
    e.g. ``"a,A b,B"``.  Later registrations overwrite earlier ones.
    """
    phonemes.update(entry.split(",") for entry in phs.split())


for _table_row in (
    "a,A b,B c,TS ĉ,CH d,D e,E f,F g,G ĝ,JH h,H",
    "ĥ,X i,I j,J ĵ,ZH k,K l,L m,M n,N o,O p,P",
    "r,R s,S ŝ,SH t,T u,U ŭ,W v,V z,Z",
):
    setphonemes(_table_row)
def getph(ch):
    """Return the phoneme symbol registered for *ch*, or "" if there is none."""
    return phonemes.get(ch, "")
def getphoneme(word): | |
return [ph for ch in word if (ph := getph(ch))] | |
# Disabled demo: change False to True to print sample getphoneme() results.
if False:
    print("# getphoneme")
    for sample in ("feliĉa", "ĝis"):
        print(f"{sample} -> {getphoneme(sample)}")
class Parser:
    """Reader with one item of look-ahead over any iterable.

    ``None`` is used both for "nothing buffered" and for "input exhausted",
    so the source iterable must not yield ``None`` itself.
    """

    def __init__(self, src):
        self.i = iter(src)  # underlying iterator
        self.cur = None     # buffered look-ahead item, None when empty

    def peek(self):
        """Return the next item without consuming it, or None at end of input."""
        # Compare with None instead of testing truthiness: a buffered falsy
        # item (0, "", ...) must stay buffered rather than being skipped.
        if self.cur is None:
            self.cur = next(self.i, None)
        return self.cur

    def read(self):
        """Consume and return the next item, or None at end of input."""
        item = self.peek()
        self.cur = None
        return item

    def accept(self):
        """Discard the buffered look-ahead item and pre-load the following one.

        Does nothing when no item is currently buffered.
        """
        if self.cur is not None:
            self.cur = None
            self.peek()
# Phoneme symbols treated as vowels; everything else non-empty is a consonant.
_VOWELS = frozenset("AEIOU")


def isconsonant(ph):
    """Return True if *ph* is a non-empty symbol other than A, E, I, O or U.

    Empty or missing input (``""`` / ``None``) gives False, so the result of
    a look-ahead at end of input can be passed in directly.  The result is
    always a real ``bool``.
    """
    # Set membership is an exact match; a substring test against "AEIOU"
    # would also accept multi-character strings such as "AE".
    return bool(ph) and ph not in _VOWELS
def syllablize(phs):
    """Group a list of phoneme symbols into syllables (a list of lists).

    The phonemes are scanned from the last one backwards.  Consonants are
    collected until a vowel is met; the vowel then takes the consonant just
    before it, and additionally the one before that when either
    the nearer consonant is L or R and the farther one is different, or
    the two form one of the pairs KV, GV, DZ.
    Consonants left over at the very start are attached to the first
    syllable (or form the only syllable when no vowel was found).
    """
    scanner = Parser(reversed(phs))
    syllables = []  # built last-syllable-first, reversed below
    pending = []    # current syllable, built last-phoneme-first
    while (ph := scanner.read()):
        pending.append(ph)
        if isconsonant(ph):
            continue  # trailing consonant: keep going left to find the vowel
        near = scanner.peek()
        if isconsonant(near):
            scanner.accept()
            pending.append(near)
            far = scanner.peek()
            if isconsonant(far):
                # Evaluated left to right, so far + near is only built when
                # the L/R rule did not already apply.
                if (near in "LR" and far != near) or far + near in ["KV", "GV", "DZ"]:
                    scanner.accept()
                    pending.append(far)
        syllables.append(pending[::-1])
        pending = []
    syllables.reverse()
    if pending:
        leading = pending[::-1]
        if syllables:
            syllables[0] = leading + syllables[0]
        else:
            syllables = [leading]
    return syllables
# Disabled demo: change False to True to print sample syllablize() results.
if False:
    print("# syllablize")
    for sample in ("feliĉa", "ĝis", "akvo", "edzo", "lingvo", "patro", "strato"):
        print(f"{sample} -> {syllablize(getphoneme(sample))}")
def setaccent(syls):
    """Mark the second-to-last syllable of *syls* and return *syls*.

    The list is modified in place: the second-to-last syllable is replaced
    by a copy prefixed with "s1" (presumably the stress marker of the target
    phoneme alphabet), and "lng" is appended to it when it does not end in a
    consonant.  Lists with fewer than two syllables are returned unchanged.
    """
    if len(syls) < 2:
        return syls
    stressed = ["s1"] + syls[-2]
    syls[-2] = stressed
    if not isconsonant(stressed[-1]):
        stressed.append("lng")
    return syls
# Disabled demo: change False to True to print sample setaccent() results.
if False:
    print("# setaccent")
    for sample in ("feliĉa", "ĝis", "akvo", "edzo", "lingvo", "patro", "strato"):
        print(f"{sample} -> {setaccent(syllablize(getphoneme(sample)))}")
def combine(syls):
    """Join syllables into one string.

    Symbols inside a syllable are separated by a single space, syllables by
    " . ".
    """
    return " . ".join(" ".join(syllable) for syllable in syls)
# ASCII digraph spellings and the accented letter each one stands for.
xsis = [
    ["cx", "ĉ"],
    ["gx", "ĝ"],
    ["hx", "ĥ"],
    ["jx", "ĵ"],
    ["sx", "ŝ"],
    ["ux", "ŭ"],
]


def getups(word):
    """Return the phoneme string for a single word.

    The word is lowercased, the digraphs listed in ``xsis`` are replaced by
    their accented letters, and the result is converted to phonemes, split
    into syllables, accent-marked and joined.  Finally every "W" (the symbol
    registered for ŭ) is rewritten as "U".
    """
    spelled = word.lower()
    for digraph, letter in xsis:
        spelled = spelled.replace(digraph, letter)
    syllables = setaccent(syllablize(getphoneme(spelled)))
    rendered = combine(syllables)
    return rendered.replace("W", "U")
# Disabled demo: change False to True to print sample getups() results.
if False:
    print("# getups")
    for sample in ("Felicxa", "Gxis", "akvo", "lingvo", "patro", "strato", "hodiaux"):
        print(f"{sample} -> {getups(sample)}")
def readtoken(p):
    """Read the next token from parser *p*.

    Returns None at end of input; otherwise a tuple ``(is_alpha, text)``
    where *text* is the longest run of characters that are all alphabetic
    (``is_alpha`` True) or all non-alphabetic (``is_alpha`` False).
    """
    first = p.peek()
    if not first:
        return None
    # The class of the first character decides what kind of run this is.
    is_alpha = str.isalpha(first)
    run = []
    while (ch := p.peek()) and str.isalpha(ch) == is_alpha:
        p.accept()
        run.append(ch)
    return (is_alpha, "".join(run))
# Opening lines of every generated document: XML declaration plus the
# <speak> start tag.  NOTE(review): xml:lang is sk-SK, presumably to select a
# voice whose sounds approximate the target language -- confirm.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sk-SK">\n'
)
def getssml(text):
    """Convert *text* into a complete SSML document and return it as a string.

    Each alphabetic token is wrapped in a ``<phoneme alphabet="ups">``
    element whose ``ph`` attribute is the result of ``getups``; every other
    run of characters (spaces, punctuation, digits, ...) is copied through
    with XML special characters escaped.  The document always ends with a
    newline followed by ``</speak>``.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    parts = [ssmlhdr]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            # Alphabetic tokens cannot contain &, < or >, so no escaping is
            # needed for the element text.
            parts.append(
                '<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token)
            )
        else:
            # Raw punctuation may include &, < or >, which would otherwise
            # make the output malformed XML.
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + '</speak>'
# Disabled demo: change False to True to print a sample getssml() conversion.
if False:
    print("# getssml")
    sample_text = "Mi estas tre ĝoja konatiĝi kun vi."
    print(sample_text)
    print("-" * 32)
    print(getssml(sample_text))
import getopt, sys | |
# getopt short-option string: -f takes one argument (a file name).
options = "f:"


def usage():
    """Print the command-line synopsis to stderr and exit with status 1."""
    # stderr keeps the message out of stdout, where the SSML is written.
    print("usage: %s -f file | text ..." % sys.argv[0], file=sys.stderr)
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(1)
if __name__ == "__main__":
    # Command-line entry point: take the input text from the file named by
    # -f, or else from the remaining arguments joined by spaces, and print
    # the resulting SSML to stdout.
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        # Diagnostics go to stderr so they never mix with SSML on stdout.
        print(e, file=sys.stderr)
        usage()
    for opt, optarg in opts:
        if opt == "-f":
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
    # An absent or empty file falls back to the positional arguments.
    if not text:
        text = " ".join(args)
    if not text:
        usage()
    print(getssml(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
このスクリプトは以下の記事で解説しています。