Last active
March 17, 2020 15:31
-
-
Save antecedent/13c66f58c055dd3b3d3650aa78229355 to your computer and use it in GitHub Desktop.
Fixes (mostly redoes) Lithuanian phonological transcription from Wiktionary-derived data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
import logging | |
# Mark appended to a consonant phoneme that is palatalized.
_PALATALIZATION = 'ʲ'

# Default phoneme for every supported (lower-case) letter. The '?A', '?E' and
# '?O' values are placeholders for vowels whose quality cannot be read off the
# spelling; the caller resolves them against the Wiktionary transcription.
_DEFAULTS = {
    'a': '?A',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
    'ą': 'aː',
    'b': 'b',
    'c': 'ts',
    'č': 'tʃ',
    'd': 'd',
    'e': '?E',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
    'ę': 'æː',
    'ė': 'eː',
    'f': 'f',
    'g': 'ɡ',
    'h': 'h',
    'i': 'ɪ',
    'į': 'iː',
    'j': 'j',
    'k': 'k',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'o': '?O',  # Distinction between /oː/ and the loan-phoneme /ɔ/; also use Wiktionary data
    'p': 'p',
    'r': 'r',
    's': 's',
    'š': 'ʃ',
    't': 't',
    'u': 'ʊ',
    'ų': 'uː',
    'ū': 'uː',
    'v': 'ʋ',
    'y': 'iː',
    'z': 'z',
    'ž': 'ʒ',
}

_FRONT_VOWEL_LETTERS = frozenset('iįyeėę')
_BACK_VOWEL_LETTERS = frozenset('aąouųū')
_CONSONANT_LETTERS = frozenset('bcčdfghjklmnprsštvzž')
_SIBILANT_LETTERS = frozenset('sšzž')

# Position-aligned voiced/voiceless obstruent pairs, plain and palatalized.
_VOICED_PHONEMES = ['b', 'd', 'ɡ', 'z', 'ʒ', 'dz', 'dʒ', 'h']
_VOICELESS_PHONEMES = ['p', 't', 'k', 's', 'ʃ', 'ts', 'tʃ', 'x']
_VOICED_PHONEMES += [ph + _PALATALIZATION for ph in _VOICED_PHONEMES]
_VOICELESS_PHONEMES += [ph + _PALATALIZATION for ph in _VOICELESS_PHONEMES]
_VOICED_SET = frozenset(_VOICED_PHONEMES)
_VOICELESS_SET = frozenset(_VOICELESS_PHONEMES)
_VOICED_COUNTERPART = dict(zip(_VOICELESS_PHONEMES, _VOICED_PHONEMES))
_VOICELESS_COUNTERPART = dict(zip(_VOICED_PHONEMES, _VOICELESS_PHONEMES))

# Outputs that fuse with a preceding <d> into an affricate.
_AFFRICATE_TAILS = frozenset(['z', 'zʲ', 'ʒ', 'ʒʲ'])


def transcribe(spelling):
    """Transcribe a Lithuanian spelling into a list of phoneme strings.

    The word is scanned from its last letter to its first so that the
    regressive processes (palatalization, voicing assimilation, word-final
    devoicing) can be driven by flags set while handling the letters further
    to the right.

    Args:
        spelling: The word, written with the lower-case letters that have an
            entry in the default letter-to-phoneme table.

    Returns:
        A list of phonemes in left-to-right order. Ambiguous vowels are left
        as the placeholders '?A', '?E' and '?O'. An empty spelling yields an
        empty list.

    Raises:
        KeyError: If the spelling contains a character that has no entry in
            the default table (this includes upper-case letters).
    """
    # Flank with boundary sentinels, then reverse so that "ahead" means the
    # letter immediately to the right in the original spelling.
    reversed_letters = ('#' + spelling + '#')[::-1]

    # Phonemes are appended right-to-left (so emitted[-1] is the phoneme just
    # to the right of the current letter) and reversed once at the end; this
    # avoids rebuilding the whole list on every prepend.
    emitted = []

    # Flags for various types of regressive feature spreading
    should_palatalize = False
    should_voice = False
    should_devoice = True  # word-final devoicing

    # Pair every real letter with its right-hand neighbour; the shorter second
    # iterable stops the walk before the leading sentinel becomes "now".
    for input_ahead, input_now in zip(reversed_letters, reversed_letters[1:-1]):
        output_ahead = emitted[-1] if emitted else '#'
        output_now = _DEFAULTS[input_now]
        # <j> never carries the palatalization mark itself.
        coarticulation = _PALATALIZATION if should_palatalize and input_now != 'j' else ''

        if input_now in _CONSONANT_LETTERS:
            output_now += coarticulation
            if should_voice and output_now in _VOICELESS_SET:
                output_now = _VOICED_COUNTERPART[output_now]
            elif should_devoice and output_now in _VOICED_SET:
                output_now = _VOICELESS_COUNTERPART[output_now]
            # An obstruent passes its (possibly assimilated) voicing on to the
            # consonants on its left; other consonants leave the flags as-is.
            should_voice = should_voice or output_now in _VOICED_SET
            should_devoice = should_devoice or output_now in _VOICELESS_SET
        else:
            # A vowel interrupts voicing assimilation and decides whether the
            # consonants on its left are palatalized.
            should_voice = False
            should_devoice = False
            if input_now in _BACK_VOWEL_LETTERS:
                should_palatalize = False
            elif input_now in _FRONT_VOWEL_LETTERS:
                should_palatalize = True

        # Digraphs
        if (input_now, input_ahead) == ('c', 'h'):
            emitted[-1] = 'x' + coarticulation
            should_devoice = True
        elif input_now == 'd' and output_ahead in _AFFRICATE_TAILS:
            emitted[-1] = 'd' + output_ahead
        elif (input_now, input_ahead) == ('i', 'e'):
            emitted[-1] = 'iə'
        elif (input_now, input_ahead) == ('u', 'o'):
            emitted[-1] = 'uə'
        # Collapse sibilant geminates and semi-geminates
        elif input_now in _SIBILANT_LETTERS and input_ahead in _SIBILANT_LETTERS:
            pass
        # Collapse true geminates of other kinds
        elif input_now == input_ahead and input_now in _CONSONANT_LETTERS:
            pass
        # <i> before back vowels just marks that the preceding consonant cluster is palatalized
        elif input_now == 'i' and input_ahead in _BACK_VOWEL_LETTERS:
            should_palatalize = True
        elif _PALATALIZATION in output_now or output_now == 'j':
            # After a palatalized consonant or /j/, a following <a>/<ą> is
            # rewritten to the corresponding front vowel.
            if output_ahead == '?A':
                emitted[-1] = '?E'
            elif output_ahead == 'aː':
                emitted[-1] = 'æː'
            emitted.append(output_now)
        else:
            emitted.append(output_now)

    emitted.reverse()
    return emitted
# Driver: read tab-separated "<spelling>\t<space-separated Wiktionary phonemes>"
# rows from the file named by the first command-line argument, transcribe each
# spelling, and use the Wiktionary phonemes only to resolve the vowels that the
# spelling leaves ambiguous. Resolved words are printed as
# "<spelling>\t<space-separated phonemes>"; unresolved ones are logged and omitted.

# (Wiktionary phoneme substring, spelling substring) pairs; when both members
# of a pair occur in a row, the word is flagged for manual review.
hiatus = [
    ('ɪjæ', 'ia'),
    ('ɪjɛ', 'ie'),
    ('ɪjɛ', 'ia'),
    ('ɪjɔ', 'io'),
    ('ɪjʊ', 'iu'),
    ('ɛjɪ', 'ei'),
    ('ɐjɪ', 'ai'),
    ('ɔjɪ', 'oi'),
    ('ʊjɪ', 'ui')
]

# The data is IPA/Lithuanian text, so decode it as UTF-8 explicitly rather than
# with the platform default; newline='' is what the csv module expects, and the
# `with` block closes the file when the run ends.
with open(sys.argv[1], encoding='utf-8', newline='') as source:
    reader = csv.reader(source, delimiter='\t')
    for graphemes, wiktionary_phonemes in reader:
        wiktionary_phonemes = wiktionary_phonemes.split()
        # Wiktionary's low vowels, in order of appearance.
        low_vowel_disambiguation = [
            ph for ph in wiktionary_phonemes
            if ph in ['aː', 'æː', 'a', 'æ', 'ɐ', 'ɛ', 'ɑ', 'ɑː']
        ]
        # Wiktionary's mid back vowels, except those directly preceded by
        # /ʊ/ or /u/ (presumably the second half of the <uo> diphthong,
        # which transcribe() already emits as a single phoneme).
        mid_vowel_disambiguation = [
            ph for n, ph in enumerate(wiktionary_phonemes)
            if ph in ['ɔ', 'oː', 'o'] and (n == 0 or wiktionary_phonemes[n - 1] not in ['ʊ', 'u'])
        ]
        our_phonemes = transcribe(graphemes)
        should_skip = False
        for n, ph in enumerate(our_phonemes):
            # Unambiguously long low vowel
            if ph in ['aː', 'æː']:
                low_vowel_disambiguation = low_vowel_disambiguation[1:]
            # Either /ɐ/ or /aː/ (also count /a, ɑ, ɑː/ as instances of the latter)
            elif ph == '?A':
                if not low_vowel_disambiguation:
                    should_skip = True
                    break
                d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
                if d not in ['ɐ', 'aː', 'a', 'ɑ', 'ɑː']:
                    should_skip = True
                    break
                if d in ['a', 'ɑ', 'ɑː']:
                    d = 'aː'
                our_phonemes[n] = d
            # Either /ɛ/ or /æː/ (also count /æ/ as an instance of the latter)
            elif ph == '?E':
                if not low_vowel_disambiguation:
                    should_skip = True
                    break
                d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
                if d not in ['ɛ', 'æː', 'æ']:
                    should_skip = True
                    break
                if d == 'æ':
                    d = 'æː'
                our_phonemes[n] = d
            # Either /ɔ/ or /oː/ (also count /o/ as an instance of the latter)
            elif ph == '?O':
                if not mid_vowel_disambiguation:
                    should_skip = True
                    break
                d, mid_vowel_disambiguation = mid_vowel_disambiguation[0], mid_vowel_disambiguation[1:]
                if d == 'o':
                    d = 'oː'
                our_phonemes[n] = d

        wiktionary_joined = ''.join(wiktionary_phonemes)
        if any(ph in wiktionary_joined and gr in graphemes for (ph, gr) in hiatus):
            logging.warning(
                'The word <%s> might exhibit hiatus. Please verify if the transcription proposed by this '
                'script, namely /%s/, preserves the original hiatus in Wiktionary\'s /%s/, if any.',
                graphemes, ''.join(our_phonemes), wiktionary_joined)
        if should_skip:
            logging.warning('The script failed to disambiguate the vowels in <%s> and will omit this word.', graphemes)
        else:
            print(graphemes + '\t' + ' '.join(our_phonemes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment