-
-
Save kylebgorman/124909662f1abdab9a97ef06237c557d to your computer and use it in GitHub Desktop.
Spanish pronunciation rules in Pynini
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Optional | |
from pynini import * | |
def _rule(tau: Fst, sigma: Fst) -> Fst:
    """Compiles `tau` as a rewrite rule with empty left and right contexts.

    Args:
        tau: the transducer describing the change to apply.
        sigma: the closure over the alphabet passed to `cdrewrite`.

    Returns:
        The optimized rewrite-rule FST.
    """
    rewrite = cdrewrite(tau, "", "", sigma)
    return rewrite.optimize()
def _right_rule(tau: Fst, right: Fst, sigma: Fst) -> Fst:
    """Compiles `tau` as a rewrite rule conditioned on a right context only.

    Args:
        tau: the transducer describing the change to apply.
        right: the right-hand context passed to `cdrewrite`; the left
            context is left empty.
        sigma: the closure over the alphabet passed to `cdrewrite`.

    Returns:
        The optimized rewrite-rule FST.
    """
    rewrite = cdrewrite(tau, "", right, sigma)
    return rewrite.optimize()
# Alphabet.
# Phonemic vowels, split by backness because several rules below are
# conditioned on a following front vowel.
front_v = union("i", "e")
back_v = union("a", "o", "u")
ph_v = front_v | back_v
# Vowel graphemes that only occur in the spelling. "ü" must be listed here:
# the "gü" rule in k_rules rewrites it, and any symbol missing from sigma
# makes the whole input fall outside the domain of every rule.
gr_v = union("á", "é", "í", "ó", "ú", "ý", "ü")
# Phonemic consonants. "R" is not a real phone; r_rules uses it as a
# temporary placeholder.
ph_c = union("b", "tʃ", "d", "f", "g", "j", "k", "l", "ʝ", "m",
             "n", "ɲ", "p", "r", "ɾ", "R", "s", "t", "w", "x", "z")
# Consonant graphemes that only occur in the spelling.
gr_c = union("c", "h", "ñ", "q", "v", "y")
# Closure over the alphabet---needed to compile rewrite rules.
sigma = closure(ph_v | gr_v | ph_c | gr_c).optimize()
## START OF PART 4.
# Unconditioned rewrites: each pair applies wherever its left-hand side
# occurs, with no context restriction.
u_rules = _rule(
    string_map(
        [
            # Digraphs.
            ("ch", "tʃ"),
            ("ll", "ʝ"),
            ("qu", "k"),
            # Acute accents are stripped from vowels.
            ("á", "a"),
            ("é", "e"),
            ("í", "i"),
            ("ó", "o"),
            ("ú", "u"),
            # Remaining one-to-one grapheme replacements.
            ("v", "b"),
            ("ñ", "ɲ"),
        ]
    ),
    sigma,
)
# TODO(kbg): This is not quite right.
# Rhotics, as a three-step cascade: "rr" is first parked on the placeholder
# "R", every remaining "r" then becomes the tap "ɾ", and finally the
# placeholder is turned into "r".
r_rules = (
    _rule(transducer("rr", "R"), sigma)
    @ _rule(transducer("r", "ɾ"), sigma)
    @ _rule(transducer("R", "r"), sigma)
).optimize()
# Rewrites for "x", "j", "y" and "h". The rules are composed in the order
# listed, and that order matters:
#   * x -> ks runs before j -> x, so the "x" produced from "j" is kept.
#   * j -> x runs before y -> j, so the "j" produced from "y" is kept.
#   * the "hi"/"hu" glide rules run before h-deletion, which would otherwise
#     remove the "h" they need to match.
b_rules = (
    _rule(transducer("x", "ks"), sigma) @
    _rule(transducer("j", "x"), sigma) @
    _rule(transducer(union("ý", "y"), "j"), sigma) @
    # "hi" and "hu" turn into glides only when a vowel follows (hierba,
    # huevo). Elsewhere the vowel is a syllable nucleus and must survive, so
    # only the "h" is dropped by the last rule (hijo, humo). ph_v is enough
    # as context because u_rules has already stripped the acute accents when
    # the rule sets are composed into g2p.
    _right_rule(transducer("hi", "j"), ph_v, sigma) @
    _right_rule(transducer("hu", "w"), ph_v, sigma) @
    _rule(transducer("h", ""), sigma)
).optimize()
# Rewrites for "g" and "c", composed in the order listed:
#   * g -> x before a front vowel runs first, so the "g" later produced
#     from "gu" is not affected by it.
#   * c -> s before a front vowel and cu -> kw before a vowel run before the
#     final catch-all c -> k.
k_rules = (
    _right_rule(transducer("g", "x"), front_v, sigma)
    @ _right_rule(transducer("gu", "g"), front_v, sigma)
    @ _rule(transducer("gü", "gw"), sigma)
    @ _right_rule(transducer("c", "s"), front_v, sigma)
    @ _right_rule(transducer("cu", "kw"), ph_v, sigma)
    @ _rule(transducer("c", "k"), sigma)
)
# The full grapheme-to-phoneme cascade: the four rule sets are composed in
# this fixed order and the result is optimized once.
g2p = (u_rules @ r_rules @ b_rules @ k_rules).optimize()
def _rewrite(s: str, rule: Fst) -> Optional[str]:
    """Applies `rule` to the string `s` and returns one output string.

    Args:
        s: the input string.
        rule: the transducer to compose the input with.

    Returns:
        The string read off the shortest path of the composed lattice, or
        None when that lattice has no start state.
    """
    lattice = compose(s, rule)
    if lattice.start() != NO_STATE_ID:
        return shortestpath(lattice).stringify()
    return None
# Smoke tests: each spelling must map to the listed transcription.
for _spelling, _expected in (
    ("cuando", "kwando"),
    ("hierba", "jeɾba"),
    ("hacer", "aseɾ"),
    ("llave", "ʝabe"),
    ("pero", "peɾo"),
):
    assert _rewrite(_spelling, g2p) == _expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment