7shi/ssml_nov.py

## ssml_nov.py
# CC0 http://creativecommons.org/publicdomain/zero/1.0/

import getopt, re, sys

# Language tag written into the SSML xml:lang attribute; getups() and the
# __main__ block also apply per-language tweaks based on it. The trailing
# comment lists the tags the author noted.
lang = "sk-SK" # ca-ES, cs-CZ, de-CH, de-DE, hr-HR, hu-HU, pl-PL, ro-RO, ru-RU, sk-SK, sl-SI
# Words fed to the "if testwords:" debug dumps further down; an empty list
# disables all of them. The commented-out expression is a sample word list.
testwords = [] #"nusen patre intestines familie lause".split()

# Lookup table: written letter -> phoneme symbol. Filled by setphonemes().
phonemes = {}

def setphonemes(phs):
    """Register letter/phoneme pairs in the module-level ``phonemes`` table.

    ``phs`` is a whitespace-separated list of ``letter,symbol`` entries.
    An entry for a letter that is already present overwrites it.
    """
    entries = (item.split(",") for item in phs.split())
    for letter, symbol in entries:
        phonemes[letter] = symbol

setphonemes("a,A b,B c,TS d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,Z")

# Ordered spelling rewrites applied by getphoneme() before the per-letter
# lookup. Each entry is a [from, to] pair; earlier rules run first, so
# "ch", "ce" and "ci" are rewritten before the bare "c" rule sees them.
repls = [rule.split(",") for rule in """
ch,ŝ ce,se ci,si c,k qu,kw sh,ŝ x,ks
au,aw eu,ew ai,ay ei,ey ie,ye
""".split()]

def getph(ch):
    """Return the phoneme symbol registered for ``ch``, or "" if there is none."""
    return phonemes.get(ch, "")

def getphoneme(word):
    """Convert a written word into a flat list of phoneme symbols.

    The word is lower-cased, the spelling rewrites in ``repls`` are applied
    in order, and each remaining character is looked up with getph().
    A table entry may hold several symbols separated by "|".  Characters
    with no entry (hyphens, digits, unmapped letters) contribute nothing.
    """
    word = word.lower()
    for a, b in repls:
        word = word.replace(a, b)
    ret = []
    for ch in word:
        # "".split("|") yields [""], so empties must be filtered explicitly.
        # A blank symbol is falsy, and syllablize() reads phonemes only
        # until it meets a falsy value, which used to cut words short.
        ret.extend(ph for ph in getph(ch).split("|") if ph)
    return ret

# Debug aid: print the raw phoneme list of every test word.
# Does nothing while testwords is empty.
if testwords:
    print("# getphoneme")
    for w in testwords:
        print(w, "->", getphoneme(w))

class Parser:
    """Reader with one item of look-ahead over any iterable.

    ``cur`` holds the buffered item, or None when nothing is buffered or
    the input is exhausted.  The buffer is tested by truthiness, so a falsy
    item is treated the same as "nothing buffered".
    """

    def __init__(self, src):
        self.i = iter(src)
        self.cur = None

    def peek(self):
        """Return the next item without consuming it (None at end of input)."""
        if not self.cur:
            # Keep the current value when the iterator has run dry.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the next item and consume it."""
        item, self.cur = self.peek(), None
        return item

    def accept(self):
        """Consume the buffered item, if any, and pre-fetch the following one."""
        if not self.cur:
            return
        self.cur = None
        self.peek()

def isconsonant(ph):
    """Tell whether ``ph`` is a non-vowel phoneme symbol.

    A falsy ``ph`` (None or "") is returned unchanged, so the result is
    falsy.  Otherwise the result is True unless ``ph`` occurs in "AEIOU".
    """
    if not ph:
        return ph
    return ph not in "AEIOU"

def syllablize(phs):
    """Group a flat phoneme list into syllables (a list of phoneme lists).

    The list is scanned from the end.  Consonants are collected until a
    vowel is met; the vowel then also takes the consonant directly before
    it.  A second preceding consonant is taken too when the pair is
    "consonant + L/R" (the two differ and the first is not J or W) or
    exactly K + W.  Consonants left over at the front of the word are
    prepended to the first syllable; a word without any vowel becomes one
    syllable.
    """
    reader = Parser(reversed(phs))
    syllables = []
    pending = []
    while (ph := reader.read()):
        pending.insert(0, ph)
        if isconsonant(ph):
            continue
        onset = reader.peek()
        if isconsonant(onset):
            reader.accept()
            pending.insert(0, onset)
            before = reader.peek()
            if isconsonant(before):
                liquid = onset in "LR" and before != onset and before not in "JW"
                if liquid or before + onset in ["KW"]:
                    reader.accept()
                    pending.insert(0, before)
        syllables.insert(0, pending)
        pending = []
    if not pending:
        return syllables
    if not syllables:
        return [pending]
    syllables[0] = pending + syllables[0]
    return syllables

# Debug aid: print the syllable split of every test word.
# Does nothing while testwords is empty.
if testwords:
    print("# syllablize")
    for w in testwords:
        print(w, "->", syllablize(getphoneme(w)))

def setaccent(syls):
    """Mark the stressed syllable in place and return ``syls``.

    * If some syllable already contains "s1", the first such syllable is
      the stressed one and no new marker is added.
    * Otherwise, with two or more syllables: the last syllable is stressed
      when its text, ignoring one closing M, N, S or D, ends in a
      consonant.  If not, the syllable just before the last
      consonant-initial syllable is stressed, falling back to the first
      syllable.  "s1" is inserted at the front of the stressed syllable.
    * A lone syllable holding a single phoneme is selected without "s1".

    When the selected syllable ends in a vowel, "lng" is appended to it.
    """
    ac = None
    marked = [syl for syl in syls if "s1" in syl]
    if marked:
        ac = marked[0]
    elif len(syls) >= 2:
        final = syls[-1]
        tail = "".join(final)
        # Every ignorable ending is one character long, so drop one char.
        if tail.endswith(("M", "N", "S", "D")):
            tail = tail[:-1]
        if tail and isconsonant(tail[-1]):
            ac = final
        else:
            for i in reversed(range(1, len(syls))):
                if isconsonant(syls[i][0]):
                    ac = syls[i - 1]
                    break
            if not ac:
                ac = syls[0]
        ac.insert(0, "s1")
    elif syls and len(syls[0]) == 1:
        ac = syls[0]
    if ac and not isconsonant(ac[-1]):
        ac.append("lng")
    return syls

# Debug aid: print the accented syllables of every test word.
# Does nothing while testwords is empty.
if testwords:
    print("# setaccent")
    for w in testwords:
        print(w, "->", setaccent(syllablize(getphoneme(w))))

def combine(syls):
    """Join syllables into one string.

    Phonemes within a syllable are separated by a space, and syllables by
    " . ".
    """
    return " . ".join(" ".join(syl) for syl in syls)

def getups(word):
    """Return the phoneme string for ``word`` that getssml() puts in ``ph``.

    The word goes through getphoneme(), syllablize(), setaccent() and
    combine().  Two language tweaks depend on the module-level ``lang``:
    for sk-SK a syllable-final "J" becomes "I", and for ro-RO, sk-SK and
    ru-RU every "W" in the result becomes "U".
    """
    syllables = setaccent(syllablize(getphoneme(word)))
    if lang == "sk-SK":
        for syllable in syllables:
            if syllable[-1] == "J":
                syllable[-1] = "I"
    ups = combine(syllables)
    if lang in ("ro-RO", "sk-SK", "ru-RU"):
        ups = ups.replace("W", "U")
    return ups

# Debug aid: print the final phoneme string of every test word.
# Does nothing while testwords is empty.
if testwords:
    print("# getups")
    for w in testwords:
        print(w, "->", getups(w))

def readtoken(p):
    """Read the next token from parser ``p``.

    Returns None at end of input.  Otherwise returns ``(True, word)`` for a
    run of alphabetic characters and hyphens, or ``(False, text)`` for a
    run of non-alphabetic characters.  A hyphen right after a word is
    absorbed into the word; elsewhere in a non-word run it stays in that
    run.
    """
    if not p.peek():
        return None
    word = ""
    while True:
        ch = p.peek()
        if not (ch == "-" or (ch and str.isalpha(ch))):
            break
        p.accept()
        word += ch
    if word:
        return (True, word)
    other = ""
    while (ch := p.peek()) and not str.isalpha(ch):
        p.accept()
        other += ch
    return (False, other)

# SSML prologue; the single %s receives the language tag for xml:lang.
ssmlhdr = """
<?xml version="1.0" encoding="UTF-8"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">
""".lstrip()

def getssml(text):
    """Render ``text`` as an SSML document string.

    Each word token from readtoken() is wrapped in a
    ``<phoneme alphabet="ups">`` element whose ``ph`` attribute comes from
    getups().  All other text is copied through with the XML special
    characters escaped.  The result always ends with a ``</speak>`` line.
    """
    # Local import keeps this block self-contained; the module's import
    # line does not pull in the XML helpers.
    from xml.sax.saxutils import escape

    # lang ends up inside a double-quoted attribute, so quote marks are
    # escaped there in addition to &, < and >.
    parts = [ssmlhdr % escape(lang, {'"': "&quot;"})]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            # Word tokens contain only letters and hyphens: nothing to escape.
            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
        else:
            # Punctuation such as "&" or "<" would otherwise break the XML.
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"): ssml += "\n"
    return ssml + '</speak>'

# Debug aid: render a fixed sample sentence and print it with its SSML.
# Does nothing while testwords is empty.
if testwords:
    print("# getssml")
    text = "Nusen Patre, kel es in siele,"
    print(text)
    print("-" * 32)
    print(getssml(text))

# getopt short-option spec: -l and -f each take an argument.
options = "l:f:"
def usage():
    """Print the command-line synopsis and terminate with exit status 1.

    Raises:
        SystemExit: always, with code 1.
    """
    print("usage: %s -l lang -f file | text ..." % sys.argv[0])
    # The bare exit() helper is installed by the site module for interactive
    # sessions and can be missing (e.g. python -S); sys.exit() always exists.
    sys.exit(1)

# Command-line entry point: choose the language and the input text, then
# print the resulting SSML document.
if __name__ == "__main__":
    text = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        # Unknown option or missing option argument: report it, then exit
        # through usage().
        print(e)
        usage()
    for opt, optarg in opts:
        # -l rebinds the module-level lang read by getups() and getssml().
        if   opt == "-l": lang = optarg
        elif opt == "-f":
            # -f takes the text from a UTF-8 file instead of the arguments.
            with open(optarg, encoding="utf-8") as f:
                text = f.read()
    # Language-specific override: for ca-ES the letter "r" maps to "rr".
    if lang == "ca-ES": phonemes["r"] = "rr"
    # Without file text (or with an empty file), use the remaining
    # arguments joined by spaces; with no text at all, show usage and exit.
    if not text: text = " ".join(args)
    if not text: usage()

    print(getssml(text))
	# CC0 http://creativecommons.org/publicdomain/zero/1.0/

	import getopt, re, sys

	lang = "sk-SK" # ca-ES, cs-CZ, de-CH, de-DE, hr-HR, hu-HU, pl-PL, ro-RO, ru-RU, sk-SK, sl-SI
	testwords = [] #"nusen patre intestines familie lause".split()

	phonemes = {}

	def setphonemes(phs):
	for ph in phs.split():
	p1, p2 = ph.split(",")
	phonemes[p1] = p2

	setphonemes("a,A b,B c,TS d,D e,E f,F g,G h,H")
	setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P")
	setphonemes("r,R s,S ŝ,SH t,T u,U v,V w,W y,J z,Z")

	repls = [pair.split(",") for pair in """
	ch,ŝ ce,se ci,si c,k qu,kw sh,ŝ x,ks
	au,aw eu,ew ai,ay ei,ey ie,ye
	""".strip().split()]

	def getph(ch):
	return phonemes[ch] if ch in phonemes else ""

	def getphoneme(word):
	ret = []
	word = word.lower()
	for a, b in repls:
	word = word.replace(a, b)
	for ch in word:
	phs = getph(ch).split("\|")
	if phs: ret += phs
	return ret

	if testwords:
	print("# getphoneme")
	for w in testwords:
	print(w, "->", getphoneme(w))

	class Parser:
	def __init__(self, src):
	self.i = iter(src)
	self.cur = None

	def peek(self):
	if self.cur: return self.cur
	try:
	self.cur = next(self.i)
	except StopIteration:
	pass
	return self.cur

	def read(self):
	ret = self.peek()
	self.cur = None
	return ret

	def accept(self):
	if self.cur:
	self.cur = None
	self.peek()

	def isconsonant(ph):
	return ph and not ph in "AEIOU"

	def syllablize(phs):
	p = Parser(reversed(phs))
	ret = []
	cur = []
	while (ph := p.read()):
	cur.insert(0, ph)
	if isconsonant(ph): continue
	c1 = p.peek()
	if isconsonant(c1):
	p.accept()
	cur.insert(0, c1)
	if isconsonant(c2 := p.peek()):
	if c1 in "LR" and c2 != c1 and not c2 in "JW":
	p.accept()
	cur.insert(0, c2)
	else:
	cc = c2 + c1
	if cc in ["KW"]:
	p.accept()
	cur.insert(0, c2)
	ret.insert(0, cur)
	cur = []
	if cur:
	if ret:
	ret[0] = cur + ret[0]
	else:
	ret = [cur]
	return ret

	if testwords:
	print("# syllablize")
	for w in testwords:
	print(w, "->", syllablize(getphoneme(w)))

	def setaccent(syls):
	ac = None
	s1 = list(filter(lambda phs: "s1" in phs, syls))
	if s1:
	ac = s1[0]
	elif len(syls) >= 2:
	last = "".join(syls[-1])
	for end in ["M", "N", "S", "D"]:
	if last.endswith(end):
	last = last[:-len(end)]
	break
	if last and isconsonant(last[-1]):
	ac = syls[-1]
	else:
	for i in range(len(syls) - 1, 0, -1):
	if isconsonant(syls[i][0]):
	ac = syls[i - 1]
	break
	if not ac: ac = syls[0]
	ac.insert(0, "s1")
	elif syls and len(syls[0]) == 1:
	ac = syls[0]
	if ac and not isconsonant(ac[-1]):
	ac.append("lng")
	return syls

	if testwords:
	print("# setaccent")
	for w in testwords:
	print(w, "->", setaccent(syllablize(getphoneme(w))))

	def combine(syls):
	return " . ".join(map(" ".join, syls))

	def getups(word):
	syls = setaccent(syllablize(getphoneme(word)))
	if lang in ["sk-SK"]:
	for syl in syls:
	if syl[-1] == "J": syl[-1] = "I"
	ret = combine(syls)
	if lang in ["ro-RO", "sk-SK", "ru-RU"]:
	ret = ret.replace("W", "U")
	return ret

	if testwords:
	print("# getups")
	for w in testwords:
	print(w, "->", getups(w))

	def readtoken(p):
	ch = p.peek()
	if not ch: return None
	ret = ""
	while (ch := p.peek()) and str.isalpha(ch) or ch == "-":
	p.accept()
	ret += ch
	if ret: return (True, ret)
	while (ch := p.peek()) and not str.isalpha(ch):
	p.accept()
	ret += ch
	return (False, ret)

	ssmlhdr = """
	<?xml version="1.0" encoding="UTF-8"?>
	<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="%s">
	""".lstrip()

	def getssml(text):
	ssml = ssmlhdr % lang
	p = Parser(text)
	while (t := readtoken(p)):
	alpha, token = t
	if alpha:
	ssml += '<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token)
	else:
	ssml += token
	if not ssml.endswith("\n"): ssml += "\n"
	return ssml + '</speak>'

	if testwords:
	print("# getssml")
	text = "Nusen Patre, kel es in siele,"
	print(text)
	print("-" * 32)
	print(getssml(text))

	options = "l:f:"
	def usage():
	print("usage: %s -l lang -f file \| text ..." % sys.argv[0])
	exit(1)

	if __name__ == "__main__":
	text = None
	try:
	opts, args = getopt.getopt(sys.argv[1:], options)
	except getopt.GetoptError as e:
	print(e)
	usage()
	for opt, optarg in opts:
	if opt == "-l": lang = optarg
	elif opt == "-f":
	with open(optarg, encoding="utf-8") as f:
	text = f.read()
	if lang == "ca-ES": phonemes["r"] = "rr"
	if not text: text = " ".join(args)
	if not text: usage()

	print(getssml(text))