7shi/ssml_ile.py

## ssml_ile.py
# CC0 http://creativecommons.org/publicdomain/zero/1.0/

import getopt, re, sys

# Language tag substituted into the SSML xml:lang attribute; the -l option
# replaces it.  It also selects the W -> U rewrite in getups.
lang = "ro-RO" # ca-ES, pl-PL, ro-RO, sk-SK, sl-SI
# When non-empty, each processing stage prints a debug dump for these words.
testwords = [] #"intercalar forme familie rapidmen duplic café ínpossibil".split()

# letter -> phoneme string; "|" separates several phonemes for one letter.
phonemes = {}

def setphonemes(phs):
    """Add letter-to-phoneme mappings to the global ``phonemes`` table.

    ``phs`` is a whitespace-separated list of ``letter,phoneme`` items.  An
    item that does not split into exactly two parts raises ``ValueError``.
    """
    phonemes.update(item.split(",") for item in phs.split())

# Letter -> phoneme table; an accented vowel yields "s1" followed by the vowel.
setphonemes(
    "a,A b,B ć,TS d,D e,E f,F g,G h,H "
    "i,I j,ZH k,K l,L m,M n,N o,O p,P "
    "r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,DZ ź,Z "
    "á,s1|A é,s1|E í,s1|I ó,s1|O ú,s1|U"
)

# Spelling rewrites as [from, to] pairs; getphoneme applies them in this
# order, so e.g. "ch" and "ce"/"ci" are handled before the bare "c".
repls = [
    rule.split(",")
    for rule in (
        "ch,ŝ ce,će ci,ći c,k qu,kw x,ks ge,je gi,ji ss,s "
        "tia,ćia tie,ćie tio,ćio tiu,ćiu"
    ).split()
]

def getph(ch):
    """Return the phoneme string registered for ``ch``, or "" when there is none."""
    return phonemes.get(ch, "")

def getphoneme(word):
    """Convert a word into a flat list of phoneme symbols.

    The word is lower-cased, an "s" between two plain vowels is rewritten to
    "ź", the ``repls`` spelling rewrites are applied in order, and every
    remaining character is looked up with ``getph``.  Characters without a
    phoneme (for example "-") are skipped.
    """
    # Lookaround keeps the vowels out of the match, so overlapping cases such
    # as "asasa" rewrite both "s" (a consuming pattern would miss the second).
    word = re.sub("(?<=[aeiou])s(?=[aeiou])", "ź", word.lower())
    for a, b in repls:
        word = word.replace(a, b)
    ret = []
    for ch in word:
        ph = getph(ch)
        # "".split("|") is [""]; an empty phoneme would stop syllablize's
        # read loop early, so unknown characters must contribute nothing.
        if ph:
            ret.extend(ph.split("|"))
    return ret

# Debug dump of the letter-to-phoneme stage (runs only when testwords is set).
if testwords:
    print("# getphoneme")
    for w in testwords:
        print(f"{w} -> {getphoneme(w)}")

class Parser:
    """Cursor over an iterable with one item of lookahead.

    ``cur`` holds the buffered item; a falsy ``cur`` means nothing is
    buffered, so items are expected to be truthy.
    """

    def __init__(self, src):
        self.i = iter(src)
        self.cur = None

    def peek(self):
        """Return the next item without consuming it (None once exhausted)."""
        if not self.cur:
            # On exhaustion the default leaves cur exactly as it was.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the next item and consume it."""
        item, self.cur = self.peek(), None
        return item

    def accept(self):
        """Consume the buffered item, if any, and prefetch the following one."""
        if not self.cur:
            return
        self.cur = None
        self.peek()

def isconsonant(ph):
    """Tell whether phoneme ``ph`` is treated as a consonant.

    A falsy ``ph`` (None or "") is returned unchanged.  Otherwise the result
    is True unless ``ph`` occurs as a substring of "AEIOU".
    """
    if not ph:
        return ph
    return ph not in "AEIOU"

def syllablize(phs):
    """Split a flat phoneme list into syllables (a list of phoneme lists).

    The phonemes are scanned from last to first.  Consonants are collected
    until a vowel (A/E/I/O/U) is reached; the vowel may then absorb a
    preceding "s1" or I/U and up to two preceding consonants.  Phonemes left
    over at the front of the word are prepended to the first syllable.

    NOTE: the scan stops at the first falsy phoneme (e.g. "").
    """
    p = Parser(reversed(phs))
    ret = []
    cur = []    # syllable being built, filled back to front
    while (ph := p.read()):
        cur.insert(0, ph)
        # consonants following the vowel simply accumulate
        if isconsonant(ph): continue
        # ph is a vowel: inspect what precedes it in the word
        c1 = p.peek()
        # absorb "s1", or an I/U that is not the same phoneme as this vowel
        if c1 == "s1" or ((c1 == "I" or c1 == "U") and ph != c1):
            p.accept()
            cur.insert(0, c1)
            c1 = p.peek()
        # one preceding consonant always joins this syllable
        if isconsonant(c1):
            p.accept()
            cur.insert(0, c1)
            # a second one joins only before a different L/R, or as "K W"
            if isconsonant(c2 := p.peek()):
                if c1 in "LR" and c2 != c1:
                    p.accept()
                    cur.insert(0, c2)
                else:
                    cc = c2 + c1
                    if cc in ["KW"]:
                        p.accept()
                        cur.insert(0, c2)
        ret.insert(0, cur)
        cur = []
    # leftovers with no vowel of their own go in front of the first syllable
    if cur:
        if ret:
            ret[0] = cur + ret[0]
        else:
            ret = [cur]
    return ret

# Debug dump of the syllable-splitting stage (runs only when testwords is set).
if testwords:
    print("# syllablize")
    for w in testwords:
        print(f"{w} -> {syllablize(getphoneme(w))}")

def setaccent(syls):
    """Mark the stressed syllable in ``syls`` (modified in place and returned).

    A syllable that already contains "s1" is taken as stressed.  Otherwise,
    for words of two or more syllables, "s1" is inserted at the front of the
    last syllable (when it is "DIE", or ends in a consonant without one of
    the listed suffixes) or of the second-to-last syllable.  A trailing "S"
    is ignored for these checks.  If the stressed syllable does not end in a
    consonant, "lng" is appended to it.
    """
    stressed = next((syl for syl in syls if "s1" in syl), None)
    if stressed is None and len(syls) >= 2:
        tail = "".join(syls[-1])
        if tail.endswith("S"):
            tail = tail[:-1]
        if tail == "DIE":
            stressed = syls[-1]
        elif isconsonant(tail[-1]):
            stressed = syls[-1]
            if tail.endswith(("BIL", "IK", "IM", "UL", "UM", "MEN")) and syls[-2]:
                stressed = syls[-2]
        if not stressed:
            stressed = syls[-2]
        stressed.insert(0, "s1")
    if stressed and not isconsonant(stressed[-1]):
        stressed.append("lng")
    return syls

# Debug dump of the stress-marking stage (runs only when testwords is set).
if testwords:
    print("# setaccent")
    for w in testwords:
        print(f"{w} -> {setaccent(syllablize(getphoneme(w)))}")

def combine(syls):
    """Join phonemes with spaces and syllables with " . " into one string."""
    return " . ".join(" ".join(syl) for syl in syls)

def getups(word):
    """Return the phoneme string for ``word``: phonemes, syllables, stress, joined.

    For the languages "ro-RO" and "sk-SK" every "W" in the result is
    replaced by "U".
    """
    syllables = setaccent(syllablize(getphoneme(word)))
    ups = combine(syllables)
    if lang == "ro-RO" or lang == "sk-SK":
        ups = ups.replace("W", "U")
    return ups

# Debug dump of the final per-word phoneme string (runs only when testwords is set).
if testwords:
    print("# getups")
    for w in testwords:
        print(f"{w} -> {getups(w)}")

def readtoken(p):
    """Read the next token from cursor ``p`` (an object with peek/accept).

    Returns ``None`` at end of input, ``(True, word)`` for a run of letters
    and hyphens that contains at least one letter, or ``(False, text)`` for a
    run of non-letter characters.
    """
    if not p.peek():
        return None
    ret = ""
    # explicit parentheses: a word character is a letter or a hyphen
    while (ch := p.peek()) and (ch.isalpha() or ch == "-"):
        p.accept()
        ret += ch
    # A run made only of hyphens is not a word; treating it as one would
    # emit a phoneme element with an empty ph attribute.
    if any(c.isalpha() for c in ret):
        return (True, ret)
    while (ch := p.peek()) and not ch.isalpha():
        p.accept()
        ret += ch
    return (False, ret)

# SSML prologue (XML declaration and opening <speak> tag); the single %s
# receives the xml:lang value.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">\n'
)

def getssml(text):
    """Build an SSML document that spells out the pronunciation of ``text``.

    Every word token is wrapped in a ``<phoneme alphabet="ups">`` element
    whose ``ph`` attribute comes from ``getups``; all other text is copied
    through with XML special characters escaped.  The result starts with
    ``ssmlhdr`` (filled with ``lang``) and ends with ``</speak>``.
    """
    # function-scope import keeps this fix independent of the file's import line
    from xml.sax.saxutils import escape

    # lang sits inside a double-quoted attribute, so quotes must be escaped too
    parts = [ssmlhdr % escape(lang, {'"': "&quot;"})]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            # word tokens hold only letters and hyphens, nothing to escape
            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
        else:
            # raw "<", ">" or "&" from the input would break the XML
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"): ssml += "\n"
    return ssml + '</speak>'

# Debug dump of a complete SSML document (runs only when testwords is set).
if testwords:
    text = "Patre nor, qui es in li cieles,"
    print("# getssml", text, "-" * 32, sep="\n")
    print(getssml(text))

# getopt short-option spec: -l and -f each take an argument.
options = "l:f:"
def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # site module and is missing under "python -S" or in frozen programs.
    sys.exit(1)

if __name__ == "__main__":
    # Command-line driver: -l sets the language, -f reads the text from a
    # UTF-8 file; without file text the remaining arguments are joined with
    # spaces.  The resulting SSML is printed to stdout.
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e)
        usage()
    text = None
    for opt, optarg in opts:
        if opt == "-f":
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
        elif opt == "-l":
            lang = optarg
    # ca-ES uses "rr" as the phoneme for the letter r
    if lang == "ca-ES":
        phonemes["r"] = "rr"
    text = text or " ".join(args)
    if not text:
        usage()

    print(getssml(text))
	# CC0 http://creativecommons.org/publicdomain/zero/1.0/

	import getopt, re, sys

	# Language tag substituted into the SSML xml:lang attribute; the -l option
	# replaces it.
	lang = "ro-RO" # ca-ES, pl-PL, ro-RO, sk-SK, sl-SI
	# When non-empty, each processing stage prints a debug dump for these words.
	testwords = [] #"intercalar forme familie rapidmen duplic café ínpossibil".split()

	# letter -> phoneme string; "|" separates several phonemes for one letter.
	phonemes = {}

	def setphonemes(phs):
	    """Add whitespace-separated ``letter,phoneme`` items to ``phonemes``."""
	    phonemes.update(item.split(",") for item in phs.split())

	setphonemes("a,A b,B ć,TS d,D e,E f,F g,G h,H")
	setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
	setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,DZ ź,Z")
	# plain "|" separators: the backslashes were escaping artifacts
	setphonemes("á,s1|A é,s1|E í,s1|I ó,s1|O ú,s1|U")

	# Spelling rewrites as [from, to] pairs, applied in order by getphoneme.
	repls = [pair.split(",") for pair in """
	ch,ŝ ce,će ci,ći c,k qu,kw x,ks ge,je gi,ji ss,s
	tia,ćia tie,ćie tio,ćio tiu,ćiu
	""".strip().split()]

	def getph(ch):
	    """Return the phoneme string registered for ``ch``, or "" when there is none."""
	    return phonemes.get(ch, "")

	def getphoneme(word):
	    """Convert a word into a flat list of phoneme symbols.

	    The word is lower-cased, an "s" between two plain vowels becomes "ź",
	    the ``repls`` rewrites are applied in order, and each remaining
	    character is looked up with ``getph``.  Characters without a phoneme
	    are skipped.
	    """
	    # lookaround so that overlapping cases like "asasa" rewrite both "s"
	    word = re.sub("(?<=[aeiou])s(?=[aeiou])", "ź", word.lower())
	    for a, b in repls:
	        word = word.replace(a, b)
	    ret = []
	    for ch in word:
	        ph = getph(ch)
	        # "".split("|") is [""]; an empty phoneme must not be emitted
	        if ph:
	            ret.extend(ph.split("|"))
	    return ret

	# Debug dump of the letter-to-phoneme stage (runs only when testwords is set).
	if testwords:
	    print("# getphoneme")
	    for w in testwords:
	        print(w, "->", getphoneme(w))

	class Parser:
	    """Cursor over an iterable with one item of lookahead.

	    ``cur`` holds the buffered item; a falsy ``cur`` means nothing is
	    buffered, so items are expected to be truthy.
	    """

	    def __init__(self, src):
	        self.i = iter(src)
	        self.cur = None

	    def peek(self):
	        """Return the next item without consuming it (None once exhausted)."""
	        if not self.cur:
	            # On exhaustion the default leaves cur exactly as it was.
	            self.cur = next(self.i, self.cur)
	        return self.cur

	    def read(self):
	        """Return the next item and consume it."""
	        item, self.cur = self.peek(), None
	        return item

	    def accept(self):
	        """Consume the buffered item, if any, and prefetch the following one."""
	        if not self.cur:
	            return
	        self.cur = None
	        self.peek()

	def isconsonant(ph):
	    """Tell whether phoneme ``ph`` is treated as a consonant.

	    A falsy ``ph`` (None or "") is returned unchanged.  Otherwise the
	    result is True unless ``ph`` occurs as a substring of "AEIOU".
	    """
	    if not ph:
	        return ph
	    return ph not in "AEIOU"

	def syllablize(phs):
	    """Split a flat phoneme list into syllables (a list of phoneme lists).

	    The phonemes are scanned from last to first.  Consonants are collected
	    until a vowel (A/E/I/O/U) is reached; the vowel may then absorb a
	    preceding "s1" or I/U and up to two preceding consonants.  Phonemes
	    left over at the front of the word are prepended to the first syllable.

	    NOTE: the scan stops at the first falsy phoneme (e.g. "").
	    """
	    p = Parser(reversed(phs))
	    ret = []
	    cur = []    # syllable being built, filled back to front
	    while (ph := p.read()):
	        cur.insert(0, ph)
	        # consonants following the vowel simply accumulate
	        if isconsonant(ph): continue
	        # ph is a vowel: inspect what precedes it in the word
	        c1 = p.peek()
	        # absorb "s1", or an I/U that is not the same phoneme as this vowel
	        if c1 == "s1" or ((c1 == "I" or c1 == "U") and ph != c1):
	            p.accept()
	            cur.insert(0, c1)
	            c1 = p.peek()
	        # one preceding consonant always joins this syllable
	        if isconsonant(c1):
	            p.accept()
	            cur.insert(0, c1)
	            # a second one joins only before a different L/R, or as "K W"
	            if isconsonant(c2 := p.peek()):
	                if c1 in "LR" and c2 != c1:
	                    p.accept()
	                    cur.insert(0, c2)
	                else:
	                    cc = c2 + c1
	                    if cc in ["KW"]:
	                        p.accept()
	                        cur.insert(0, c2)
	        ret.insert(0, cur)
	        cur = []
	    # leftovers with no vowel of their own go in front of the first syllable
	    if cur:
	        if ret:
	            ret[0] = cur + ret[0]
	        else:
	            ret = [cur]
	    return ret

	# Debug dump of the syllable-splitting stage (runs only when testwords is set).
	if testwords:
	    print("# syllablize")
	    for w in testwords:
	        print(w, "->", syllablize(getphoneme(w)))

	def setaccent(syls):
	    """Mark the stressed syllable in ``syls`` (modified in place and returned).

	    A syllable that already contains "s1" is taken as stressed.  Otherwise,
	    for words of two or more syllables, "s1" is inserted at the front of
	    the last syllable (when it is "DIE", or ends in a consonant without one
	    of the listed suffixes) or of the second-to-last syllable.  A trailing
	    "S" is ignored for these checks.  If the stressed syllable does not end
	    in a consonant, "lng" is appended to it.
	    """
	    ac = None
	    s1 = list(filter(lambda phs: "s1" in phs, syls))
	    if s1:
	        ac = s1[0]
	    elif len(syls) >= 2:
	        last = "".join(syls[-1])
	        if last.endswith("S"): last = last[:-1]
	        if last == "DIE":
	            ac = syls[-1]
	        elif isconsonant(last[-1]):
	            for sfx in ["BIL", "IK", "IM", "UL", "UM", "MEN"]:
	                if last.endswith(sfx):
	                    ac = syls[-2]
	                    break
	            if not ac: ac = syls[-1]
	        if not ac: ac = syls[-2]
	        ac.insert(0, "s1")
	    if ac and not isconsonant(ac[-1]):
	        ac.append("lng")
	    return syls

	# Debug dump of the stress-marking stage (runs only when testwords is set).
	if testwords:
	    print("# setaccent")
	    for w in testwords:
	        print(w, "->", setaccent(syllablize(getphoneme(w))))

	def combine(syls):
	    """Join phonemes with spaces and syllables with " . " into one string."""
	    return " . ".join(" ".join(syl) for syl in syls)

	def getups(word):
	    """Return the phoneme string for ``word``: phonemes, syllables, stress, joined.

	    For the languages "ro-RO" and "sk-SK" every "W" in the result is
	    replaced by "U".
	    """
	    ret = combine(setaccent(syllablize(getphoneme(word))))
	    if lang in ["ro-RO", "sk-SK"]: ret = ret.replace("W", "U")
	    return ret

	# Debug dump of the final per-word phoneme string (runs only when testwords is set).
	if testwords:
	    print("# getups")
	    for w in testwords:
	        print(w, "->", getups(w))

	def readtoken(p):
	    """Read the next token from cursor ``p`` (an object with peek/accept).

	    Returns ``None`` at end of input, ``(True, word)`` for a run of letters
	    and hyphens that contains at least one letter, or ``(False, text)`` for
	    a run of non-letter characters.
	    """
	    if not p.peek():
	        return None
	    ret = ""
	    # explicit parentheses: a word character is a letter or a hyphen
	    while (ch := p.peek()) and (ch.isalpha() or ch == "-"):
	        p.accept()
	        ret += ch
	    # A run made only of hyphens is not a word; treating it as one would
	    # emit a phoneme element with an empty ph attribute.
	    if any(c.isalpha() for c in ret):
	        return (True, ret)
	    while (ch := p.peek()) and not ch.isalpha():
	        p.accept()
	        ret += ch
	    return (False, ret)

	# SSML prologue (XML declaration and opening <speak> tag); the single %s
	# receives the xml:lang value.  Written as adjacent literals so that no
	# stray indentation ends up inside the string.
	ssmlhdr = (
	    '<?xml version="1.0" encoding="UTF-8"?>\n'
	    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">\n'
	)

	def getssml(text):
	    """Build an SSML document that spells out the pronunciation of ``text``.

	    Every word token is wrapped in a ``<phoneme alphabet="ups">`` element
	    whose ``ph`` attribute comes from ``getups``; all other text is copied
	    through with XML special characters escaped.  The result starts with
	    ``ssmlhdr`` (filled with ``lang``) and ends with ``</speak>``.
	    """
	    # function-scope import keeps this fix independent of the file's import line
	    from xml.sax.saxutils import escape

	    # lang sits inside a double-quoted attribute, so quotes must be escaped too
	    parts = [ssmlhdr % escape(lang, {'"': "&quot;"})]
	    p = Parser(text)
	    while (t := readtoken(p)):
	        alpha, token = t
	        if alpha:
	            # word tokens hold only letters and hyphens, nothing to escape
	            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
	        else:
	            # raw "<", ">" or "&" from the input would break the XML
	            parts.append(escape(token))
	    ssml = "".join(parts)
	    if not ssml.endswith("\n"): ssml += "\n"
	    return ssml + '</speak>'

	# Debug dump of a complete SSML document (runs only when testwords is set).
	if testwords:
	    print("# getssml")
	    text = "Patre nor, qui es in li cieles,"
	    print(text)
	    print("-" * 32)
	    print(getssml(text))

	# getopt short-option spec: -l and -f each take an argument.
	options = "l:f:"
	def usage():
	    """Print the command-line synopsis and exit with status 1."""
	    # plain "|" in the message: the backslash was an escaping artifact
	    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
	    # sys.exit rather than the builtin exit(): the latter is injected by
	    # the site module and is missing under "python -S" or in frozen programs.
	    sys.exit(1)

	if __name__ == "__main__":
	    # Command-line driver: -l sets the language, -f reads the text from a
	    # UTF-8 file; without file text the remaining arguments are joined
	    # with spaces.  The resulting SSML is printed to stdout.
	    text = None
	    try:
	        opts, args = getopt.getopt(sys.argv[1:], options)
	    except getopt.GetoptError as e:
	        print(e)
	        usage()
	    for opt, optarg in opts:
	        if opt == "-l": lang = optarg
	        elif opt == "-f":
	            with open(optarg, encoding="utf-8") as f:
	                text = f.read()
	    # ca-ES uses "rr" as the phoneme for the letter r
	    if lang == "ca-ES": phonemes["r"] = "rr"
	    if not text: text = " ".join(args)
	    if not text: usage()

	    print(getssml(text))