# 7shi/ssml_ido.py

## ssml_ido.py
# CC0 http://creativecommons.org/publicdomain/zero/1.0/

# Spelling (single letter or digraph) -> phoneme symbol.
phonemes = {}
# First letters of multi-letter spellings; only the keys are consulted,
# the stored value is a dummy.
phonemes2 = {}

def setphonemes(phs):
    """Register spelling/phoneme pairs in the module-level tables.

    *phs* is a whitespace-separated list of ``spelling,symbol`` items.
    Each pair is stored in ``phonemes``; for spellings longer than one
    character the first letter is also recorded in ``phonemes2`` so that
    a lookahead can be attempted when that letter is met.

    Raises ValueError if an item does not split into exactly two parts
    at the comma.
    """
    for entry in phs.split():
        spelling, symbol = entry.split(",")
        phonemes[spelling] = symbol
        if len(spelling) > 1:
            phonemes2[spelling[0]] = 1

# Load the spelling -> phoneme table (digraphs: ch, qu, sh).
setphonemes("a,A b,B c,TS ch,CH d,D e,E f,F g,G h,H")
setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P qu,KW")
setphonemes("r,R s,S sh,SH t,T u,U v,V w,W x,KS y,J z,Z")

# Sample words printed by the disabled ``if False:`` debug blocks below.
tests = [
    "mashino",
    "aquo",
    "linguo",
    "patro",
    "strato",
    "serchar",
    "familio",
    "dio",
    "manuo",
    "frua",
]

def getph(ch):
    """Return the phoneme symbol registered for spelling *ch*, or "" if none."""
    return phonemes.get(ch, "")

class Parser:
    """Reader with one item of lookahead over any iterable.

    The buffered item is tested for truthiness, not for ``None``: a falsy
    item (such as "") is returned once by peek() but is not considered
    buffered, so the following peek()/read() moves past it.
    """

    def __init__(self, src):
        self.i = iter(src)
        # Lookahead buffer; None when nothing has been fetched.
        self.cur = None

    def peek(self):
        """Return the upcoming item without consuming it.

        Returns None once the source is exhausted (and nothing is buffered).
        """
        if not self.cur:
            # Fetch one item; when the iterator is exhausted keep the old value.
            self.cur = next(self.i, self.cur)
        return self.cur

    def read(self):
        """Return the upcoming item and consume it."""
        item, self.cur = self.peek(), None
        return item

    def accept(self):
        """Consume the buffered item, if any, and pre-load the one after it."""
        if not self.cur:
            return
        self.cur = None
        self.peek()

def getphoneme(word):
    """Convert a (lower-case) word into a list of phoneme symbols.

    Characters are looked up in ``phonemes``.  When a character is the
    first letter of a registered multi-letter spelling (``phonemes2``) and
    forms one together with the next character, both are consumed as a
    single unit (e.g. "ch" -> "CH").

    Characters without an entry, such as an apostrophe, are skipped.
    Emitting "" for them would make syllablize() stop early, because it
    loops on ``while (ph := p.read())`` and an empty string ends that loop;
    a word like "amiko'" would then lose all of its phonemes.
    """
    p = Parser(word)
    ret = []
    while ch := p.read():
        if ch in phonemes2:
            ch2 = p.peek()
            if ch2 and ch + ch2 in phonemes:
                ch += p.read()
        ph = phonemes.get(ch)
        if ph:
            ret.append(ph)
    return ret

if False:  # debug aid, disabled: prints the phoneme list of every sample word
    print("# getphoneme")
    for sample in tests:
        print(sample, "->", getphoneme(sample))

def isconsonant(ph):
    """Tell whether phoneme *ph* is a consonant.

    A falsy *ph* (None or "") is returned unchanged, so the result is falsy;
    otherwise the result is True unless the first character is one of
    A, E, I, O, U.
    """
    if not ph:
        return ph
    return ph[0] not in "AEIOU"

def syllablize(phs):
    """Group a list of phoneme symbols into syllables.

    The phonemes are scanned from the end of the word.  Consonants met
    before a vowel (in scan order) become the coda of that vowel's
    syllable.  Each vowel then takes one preceding consonant as onset, and
    a second one only when the first is L or R and the two differ
    (e.g. "T R O").  Consonants left over at the start of the word are
    prepended to the first syllable.

    Finally, when there are at least three syllables, a last syllable that
    starts with a vowel is merged into the previous one if that one ends
    in I or U (e.g. the "L I" + "O" of "familio").

    Empty entries in *phs* are ignored; an empty string would otherwise
    end the scanning loop early and drop the rest of the word.

    Returns a list of syllables, each a list of phoneme strings.
    """
    p = Parser(reversed([ph for ph in phs if ph]))
    ret = []
    cur = []
    while (ph := p.read()):
        cur.insert(0, ph)
        if isconsonant(ph):
            # Coda consonant: keep collecting until the vowel is reached.
            continue
        c1 = p.peek()
        if isconsonant(c1):
            p.accept()
            cur.insert(0, c1)
            c2 = p.peek()
            # Two-consonant onset only for consonant + L/R clusters.
            if isconsonant(c2) and c1 in "LR" and c2 != c1:
                p.accept()
                cur.insert(0, c2)
        ret.insert(0, cur)
        cur = []
    if cur:
        # Word-initial consonants that found no vowel of their own.
        if ret:
            ret[0] = cur + ret[0]
        else:
            ret = [cur]
    if len(ret) >= 3 and not isconsonant(ret[-1][0]) and ret[-2][-1] in "IU": # diphthong
        last = ret.pop()
        ret[-1] += last
    return ret

if False:  # debug aid, disabled: prints the syllables of every sample word
    print("# syllablize")
    for sample in tests:
        print(sample, "->", syllablize(getphoneme(sample)))

def setaccent(syls):
    """Insert the stress marker "s1" into a syllable list.

    Words with fewer than two syllables are returned untouched.  When the
    last syllable ends in a vowel A, I or O followed by R (the infinitive
    endings -ar, -ir, -or) the last syllable is stressed; previously only
    -ar was recognised.  Otherwise the penultimate syllable is stressed,
    and "lng" is appended to it when it ends in a vowel.

    The outer list *syls* is modified in place (the affected syllable is
    replaced by a new list) and also returned.
    """
    if len(syls) < 2:
        return syls
    last = syls[-1]
    if len(last) >= 2 and last[-1] == "R" and last[-2] in ("A", "I", "O"): # infinitive
        syls[-1] = ["s1"] + last
    else:
        stressed = ["s1"] + syls[-2]
        if not isconsonant(stressed[-1]):
            stressed.append("lng")
        syls[-2] = stressed
    return syls

if False:  # debug aid, disabled: prints the accented syllables of every sample word
    print("# setaccent")
    for sample in tests:
        print(sample, "->", setaccent(syllablize(getphoneme(sample))))

def combine(syls):
    """Render syllables as text: symbols separated by " ", syllables by " . "."""
    return " . ".join(" ".join(syl) for syl in syls)

def getups(word):
    """Return the phoneme string for *word* as used in the ``ph`` attribute.

    The word is lower-cased, converted to phonemes, syllabified, accented
    and joined; afterwards "W" becomes "U" and the compound symbols "KU"
    and "KS" are split into two space-separated symbols.
    """
    syls = setaccent(syllablize(getphoneme(word.lower())))
    ups = combine(syls)
    for old, new in (("W", "U"), ("KU", "K U"), ("KS", "K S")):
        ups = ups.replace(old, new)
    return ups

if False:  # debug aid, disabled: prints the final phoneme string of every sample word
    print("# getups")
    for sample in tests:
        print(sample, "->", getups(sample))

def readtoken(p):
    """Read the next token from parser *p*.

    Returns None at end of input, otherwise a tuple ``(is_word, text)``:

    * a run of letters and apostrophes -> ``(True, text)`` when it contains
      at least one letter, ``(False, text)`` when it consists of
      apostrophes only (a lone quote mark is not a word and must not be
      turned into a phoneme element with an empty ``ph``);
    * otherwise the run of non-letters up to the next letter ->
      ``(False, text)``.
    """
    if not p.peek():
        return None
    chars = []
    # Explicit parentheses: the end-of-input check guards both alternatives.
    while (ch := p.peek()) and (ch.isalpha() or ch == "'"):
        p.accept()
        chars.append(ch)
    if chars:
        word = "".join(chars)
        return (any(c.isalpha() for c in word), word)
    while (ch := p.peek()) and not ch.isalpha():
        p.accept()
        chars.append(ch)
    return (False, "".join(chars))

# XML declaration and opening <speak> element; each line ends with a newline.
# NOTE(review): xml:lang is "sk-SK" although the input is Ido - presumably a
# deliberate choice of voice; confirm before changing.
ssmlhdr = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sk-SK">\n'
)

def getssml(text):
    """Build an SSML document for *text*.

    Every word token is wrapped in a ``<phoneme alphabet="ups">`` element
    whose ``ph`` attribute comes from getups().  All other text
    (whitespace, punctuation, digits) is copied through with ``&``, ``<``
    and ``>`` escaped, so arbitrary input cannot produce malformed XML.
    Word tokens contain only letters and apostrophes and need no escaping.

    The result starts with ``ssmlhdr`` and ends with ``</speak>`` on its
    own line.
    """
    # Local import so the function does not depend on the file's import block.
    from xml.sax.saxutils import escape

    parts = [ssmlhdr]
    p = Parser(text)
    while (t := readtoken(p)):
        alpha, token = t
        if alpha:
            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
        else:
            parts.append(escape(token))
    ssml = "".join(parts)
    if not ssml.endswith("\n"):
        ssml += "\n"
    return ssml + '</speak>'

if False:  # debug aid, disabled: prints a sample sentence and its SSML
    print("# getssml")
    text = "L'amiko serchas la familio."
    print(text + "\n" + "-" * 32)
    print(getssml(text))

import getopt, sys

# getopt option string: -f takes an argument (the input file name).
options = "f:"
def usage():
    """Print the command-line synopsis and exit with status 1.

    Uses sys.exit() rather than the exit() helper, which is injected by
    the ``site`` module and is absent under ``python -S`` or when frozen.
    """
    print("usage: %s -f file | text ..." % sys.argv[0])
    sys.exit(1)

if __name__ == "__main__":
    # Input comes from the file given with -f; if that yields nothing,
    # from the remaining command-line arguments joined by spaces.
    try:
        opts, args = getopt.getopt(sys.argv[1:], options)
    except getopt.GetoptError as e:
        print(e)
        usage()
    text = None
    for opt, optarg in opts:
        if opt != "-f":
            continue
        with open(optarg, encoding="utf-8") as f:
            text = f.read()
    text = text or " ".join(args)
    if not text:
        usage()

    print(getssml(text))
	# CC0 http://creativecommons.org/publicdomain/zero/1.0/

	# Spelling (single letter or digraph) -> phoneme symbol.
	phonemes = {}
	# First letters of multi-letter spellings; only the keys are consulted,
	# the stored value is a dummy.
	phonemes2 = {}

	def setphonemes(phs):
	    """Register spelling/phoneme pairs in the module-level tables.

	    *phs* is a whitespace-separated list of ``spelling,symbol`` items.
	    Each pair is stored in ``phonemes``; for spellings longer than one
	    character the first letter is also recorded in ``phonemes2`` so that
	    a lookahead can be attempted when that letter is met.

	    Raises ValueError if an item does not split into exactly two parts
	    at the comma.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    for entry in phs.split():
	        spelling, symbol = entry.split(",")
	        phonemes[spelling] = symbol
	        if len(spelling) > 1:
	            phonemes2[spelling[0]] = 1

	# Load the spelling -> phoneme table (digraphs: ch, qu, sh).
	setphonemes("a,A b,B c,TS ch,CH d,D e,E f,F g,G h,H")
	setphonemes("i,I j,ZH k,K l,L m,M n,N o,O p,P qu,KW")
	setphonemes("r,R s,S sh,SH t,T u,U v,V w,W x,KS y,J z,Z")

	# Sample words printed by the disabled ``if False:`` debug blocks below.
	tests = [
	    "mashino",
	    "aquo",
	    "linguo",
	    "patro",
	    "strato",
	    "serchar",
	    "familio",
	    "dio",
	    "manuo",
	    "frua",
	]

	def getph(ch):
	    """Return the phoneme symbol registered for spelling *ch*, or "" if none."""
	    # Body indentation restored; the flattened original did not parse.
	    return phonemes.get(ch, "")

	class Parser:
	    """Reader with one item of lookahead over any iterable.

	    The buffered item is tested for truthiness, not for ``None``: a falsy
	    item (such as "") is returned once by peek() but is not considered
	    buffered, so the following peek()/read() moves past it.
	    """
	    # Class/method indentation restored; the flattened original did not parse.

	    def __init__(self, src):
	        self.i = iter(src)
	        # Lookahead buffer; None when nothing has been fetched.
	        self.cur = None

	    def peek(self):
	        """Return the upcoming item without consuming it.

	        Returns None once the source is exhausted (and nothing is buffered).
	        """
	        if not self.cur:
	            # Fetch one item; when the iterator is exhausted keep the old value.
	            self.cur = next(self.i, self.cur)
	        return self.cur

	    def read(self):
	        """Return the upcoming item and consume it."""
	        item, self.cur = self.peek(), None
	        return item

	    def accept(self):
	        """Consume the buffered item, if any, and pre-load the one after it."""
	        if not self.cur:
	            return
	        self.cur = None
	        self.peek()

	def getphoneme(word):
	    """Convert a (lower-case) word into a list of phoneme symbols.

	    Characters are looked up in ``phonemes``.  When a character is the
	    first letter of a registered multi-letter spelling (``phonemes2``) and
	    forms one together with the next character, both are consumed as a
	    single unit (e.g. "ch" -> "CH").

	    Characters without an entry, such as an apostrophe, are skipped.
	    Emitting "" for them would make syllablize() stop early, because it
	    loops on ``while (ph := p.read())`` and an empty string ends that loop.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    p = Parser(word)
	    ret = []
	    while ch := p.read():
	        if ch in phonemes2:
	            ch2 = p.peek()
	            if ch2 and ch + ch2 in phonemes:
	                ch += p.read()
	        ph = phonemes.get(ch)
	        if ph:
	            ret.append(ph)
	    return ret

	if False:  # debug aid, disabled: prints the phoneme list of every sample word
	    # Block indentation restored; the flattened original did not parse.
	    print("# getphoneme")
	    for sample in tests:
	        print(sample, "->", getphoneme(sample))

	def isconsonant(ph):
	    """Tell whether phoneme *ph* is a consonant.

	    A falsy *ph* (None or "") is returned unchanged, so the result is falsy;
	    otherwise the result is True unless the first character is one of
	    A, E, I, O, U.
	    """
	    # Body indentation restored; the flattened original did not parse.
	    if not ph:
	        return ph
	    return ph[0] not in "AEIOU"

	def syllablize(phs):
	    """Group a list of phoneme symbols into syllables.

	    The phonemes are scanned from the end of the word.  Consonants met
	    before a vowel (in scan order) become the coda of that vowel's
	    syllable.  Each vowel then takes one preceding consonant as onset, and
	    a second one only when the first is L or R and the two differ.
	    Consonants left over at the start of the word are prepended to the
	    first syllable.

	    Finally, when there are at least three syllables, a last syllable that
	    starts with a vowel is merged into the previous one if that one ends
	    in I or U.

	    Empty entries in *phs* are ignored; an empty string would otherwise
	    end the scanning loop early and drop the rest of the word.

	    Returns a list of syllables, each a list of phoneme strings.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    p = Parser(reversed([ph for ph in phs if ph]))
	    ret = []
	    cur = []
	    while (ph := p.read()):
	        cur.insert(0, ph)
	        if isconsonant(ph):
	            # Coda consonant: keep collecting until the vowel is reached.
	            continue
	        c1 = p.peek()
	        if isconsonant(c1):
	            p.accept()
	            cur.insert(0, c1)
	            c2 = p.peek()
	            # Two-consonant onset only for consonant + L/R clusters.
	            if isconsonant(c2) and c1 in "LR" and c2 != c1:
	                p.accept()
	                cur.insert(0, c2)
	        ret.insert(0, cur)
	        cur = []
	    if cur:
	        # Word-initial consonants that found no vowel of their own.
	        if ret:
	            ret[0] = cur + ret[0]
	        else:
	            ret = [cur]
	    if len(ret) >= 3 and not isconsonant(ret[-1][0]) and ret[-2][-1] in "IU": # diphthong
	        last = ret.pop()
	        ret[-1] += last
	    return ret

	if False:  # debug aid, disabled: prints the syllables of every sample word
	    # Block indentation restored; the flattened original did not parse.
	    print("# syllablize")
	    for sample in tests:
	        print(sample, "->", syllablize(getphoneme(sample)))

	def setaccent(syls):
	    """Insert the stress marker "s1" into a syllable list.

	    Words with fewer than two syllables are returned untouched.  When the
	    last syllable ends in a vowel A, I or O followed by R (the infinitive
	    endings -ar, -ir, -or) the last syllable is stressed; previously only
	    -ar was recognised.  Otherwise the penultimate syllable is stressed,
	    and "lng" is appended to it when it ends in a vowel.

	    The outer list *syls* is modified in place (the affected syllable is
	    replaced by a new list) and also returned.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    if len(syls) < 2:
	        return syls
	    last = syls[-1]
	    if len(last) >= 2 and last[-1] == "R" and last[-2] in ("A", "I", "O"): # infinitive
	        syls[-1] = ["s1"] + last
	    else:
	        stressed = ["s1"] + syls[-2]
	        if not isconsonant(stressed[-1]):
	            stressed.append("lng")
	        syls[-2] = stressed
	    return syls

	if False:  # debug aid, disabled: prints the accented syllables of every sample word
	    # Block indentation restored; the flattened original did not parse.
	    print("# setaccent")
	    for sample in tests:
	        print(sample, "->", setaccent(syllablize(getphoneme(sample))))

	def combine(syls):
	    """Render syllables as text: symbols separated by " ", syllables by " . "."""
	    # Body indentation restored; the flattened original did not parse.
	    return " . ".join(" ".join(syl) for syl in syls)

	def getups(word):
	    """Return the phoneme string for *word* as used in the ``ph`` attribute.

	    The word is lower-cased, converted to phonemes, syllabified, accented
	    and joined; afterwards "W" becomes "U" and the compound symbols "KU"
	    and "KS" are split into two space-separated symbols.
	    """
	    # Body indentation restored; the flattened original did not parse.
	    syls = setaccent(syllablize(getphoneme(word.lower())))
	    ups = combine(syls)
	    for old, new in (("W", "U"), ("KU", "K U"), ("KS", "K S")):
	        ups = ups.replace(old, new)
	    return ups

	if False:  # debug aid, disabled: prints the final phoneme string of every sample word
	    # Block indentation restored; the flattened original did not parse.
	    print("# getups")
	    for sample in tests:
	        print(sample, "->", getups(sample))

	def readtoken(p):
	    """Read the next token from parser *p*.

	    Returns None at end of input, otherwise a tuple ``(is_word, text)``:

	    * a run of letters and apostrophes -> ``(True, text)`` when it contains
	      at least one letter, ``(False, text)`` when it consists of
	      apostrophes only (a lone quote mark is not a word);
	    * otherwise the run of non-letters up to the next letter ->
	      ``(False, text)``.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    if not p.peek():
	        return None
	    chars = []
	    # Explicit parentheses: the end-of-input check guards both alternatives.
	    while (ch := p.peek()) and (ch.isalpha() or ch == "'"):
	        p.accept()
	        chars.append(ch)
	    if chars:
	        word = "".join(chars)
	        return (any(c.isalpha() for c in word), word)
	    while (ch := p.peek()) and not ch.isalpha():
	        p.accept()
	        chars.append(ch)
	    return (False, "".join(chars))

	# XML declaration and opening <speak> element; each line ends with a newline.
	# Written as explicit literals so that no indentation characters end up
	# inside the string (the triple-quoted form here carried stray tabs).
	# NOTE(review): xml:lang is "sk-SK" although the input is Ido - presumably a
	# deliberate choice of voice; confirm before changing.
	ssmlhdr = (
	    '<?xml version="1.0" encoding="UTF-8"?>\n'
	    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="sk-SK">\n'
	)

	def getssml(text):
	    """Build an SSML document for *text*.

	    Every word token is wrapped in a ``<phoneme alphabet="ups">`` element
	    whose ``ph`` attribute comes from getups().  All other text
	    (whitespace, punctuation, digits) is copied through with ``&``, ``<``
	    and ``>`` escaped, so arbitrary input cannot produce malformed XML.

	    The result starts with ``ssmlhdr`` and ends with ``</speak>`` on its
	    own line.
	    """
	    # Nested indentation restored; the flattened original did not parse.
	    # Local import so the function does not depend on the file's import block.
	    from xml.sax.saxutils import escape

	    parts = [ssmlhdr]
	    p = Parser(text)
	    while (t := readtoken(p)):
	        alpha, token = t
	        if alpha:
	            parts.append('<phoneme alphabet="ups" ph="%s">%s</phoneme>' % (getups(token), token))
	        else:
	            parts.append(escape(token))
	    ssml = "".join(parts)
	    if not ssml.endswith("\n"):
	        ssml += "\n"
	    return ssml + '</speak>'

	if False:  # debug aid, disabled: prints a sample sentence and its SSML
	    # Block indentation restored; the flattened original did not parse.
	    print("# getssml")
	    text = "L'amiko serchas la familio."
	    print(text)
	    print("-" * 32)
	    print(getssml(text))

	import getopt, sys

	# getopt option string: -f takes an argument (the input file name).
	options = "f:"
	def usage():
	    """Print the command-line synopsis and exit with status 1.

	    Uses sys.exit() rather than the exit() helper, which is injected by
	    the ``site`` module and is absent under ``python -S`` or when frozen.
	    """
	    # The message uses a plain "|"; the "\|" found here was an invalid
	    # escape sequence and printed a stray backslash.
	    print("usage: %s -f file | text ..." % sys.argv[0])
	    sys.exit(1)

	if __name__ == "__main__":
	    # Nested indentation restored; the flattened original did not parse.
	    # Input comes from the file given with -f; if that yields nothing,
	    # from the remaining command-line arguments joined by spaces.
	    text = None
	    try:
	        opts, args = getopt.getopt(sys.argv[1:], options)
	    except getopt.GetoptError as e:
	        print(e)
	        usage()
	    for opt, optarg in opts:
	        if opt == "-f":
	            with open(optarg, encoding="utf-8") as f:
	                text = f.read()
	    if not text:
	        text = " ".join(args)
	    if not text:
	        usage()

	    print(getssml(text))