Skip to content

Instantly share code, notes, and snippets.

@antecedent
Last active March 17, 2020 15:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save antecedent/13c66f58c055dd3b3d3650aa78229355 to your computer and use it in GitHub Desktop.
Save antecedent/13c66f58c055dd3b3d3650aa78229355 to your computer and use it in GitHub Desktop.
Fixes (mostly redoes) Lithuanian phonological transcription from Wiktionary-derived data
import csv
import sys
import logging
def transcribe(spelling):
    """Transcribe a Lithuanian spelling into a list of phoneme symbols.

    The word is scanned from its last letter to its first so that the
    regressive processes (palatalization, voicing assimilation and word-final
    devoicing) can be modelled with flags carried from one letter to the
    next one on its left.

    The letters <a>, <e> and <o> are emitted as the placeholders ``'?A'``,
    ``'?E'`` and ``'?O'``; the caller is expected to resolve them with
    external (Wiktionary) data.

    Args:
        spelling: A word written with the lowercase letters that appear as
            keys of ``defaults`` below.

    Returns:
        A list of phoneme strings in left-to-right (reading) order.

    Raises:
        KeyError: If ``spelling`` contains a character that has no entry in
            ``defaults`` (this includes ``'#'``, the boundary sentinel).
    """
    defaults = {
        'a': '?A',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
        'ą': 'aː',
        'b': 'b',
        'c': 'ts',
        'č': 'tʃ',
        'd': 'd',
        'e': '?E',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
        'ę': 'æː',
        'ė': 'eː',
        'f': 'f',
        'g': 'ɡ',
        'h': 'h',
        'i': 'ɪ',
        'į': 'iː',
        'j': 'j',
        'k': 'k',
        'l': 'l',
        'm': 'm',
        'n': 'n',
        'o': '?O',  # Distinction between /oː/ and the loan-phoneme /ɔ/; also use Wiktionary data
        'p': 'p',
        'r': 'r',
        's': 's',
        'š': 'ʃ',
        't': 't',
        'u': 'ʊ',
        'ų': 'uː',
        'ū': 'uː',
        'v': 'ʋ',
        'y': 'iː',
        'z': 'z',
        'ž': 'ʒ',
    }
    palatalization = 'ʲ'
    front_vowel_letters = set('iįyeėę')
    back_vowel_letters = set('aąouųū')
    consonant_letters = set('bcčdfghjklmnprsštvzž')
    sibilant_letters = set('sšzž')

    # Position-aligned voiced/voiceless obstruent pairs, plain and palatalized.
    voiced_consonant_phonemes = ['b', 'd', 'ɡ', 'z', 'ʒ', 'dz', 'dʒ', 'h']
    voiceless_consonant_phonemes = ['p', 't', 'k', 's', 'ʃ', 'ts', 'tʃ', 'x']
    voiced_consonant_phonemes += [ph + palatalization for ph in voiced_consonant_phonemes]
    voiceless_consonant_phonemes += [ph + palatalization for ph in voiceless_consonant_phonemes]
    voicing = dict(zip(voiceless_consonant_phonemes, voiced_consonant_phonemes))
    devoicing = dict(zip(voiced_consonant_phonemes, voiceless_consonant_phonemes))

    # 1. Flank with boundary sentinels
    # 2. Reverse, so that regressive processes are easier to simulate
    letters = '#' + spelling[::-1] + '#'

    # The output is accumulated right-to-left: reversed_output[-1] is always
    # the phoneme immediately to the right of the letter being processed.
    # Appending here and reversing once at the end avoids rebuilding the
    # whole list for every phoneme.
    reversed_output = []

    # Flags for various types of regressive feature spreading
    should_palatalize = False
    should_voice = False
    should_devoice = True  # word-final devoicing

    # Pairs are (letter to the right, current letter); the trailing sentinel
    # is never a "current" letter.
    for input_ahead, input_now in zip(letters, letters[1:-1]):
        output_ahead = reversed_output[-1] if reversed_output else '#'
        output_now = defaults[input_now]
        coarticulation = palatalization if should_palatalize and input_now != 'j' else ''

        if input_now in consonant_letters:
            output_now += coarticulation
            if should_voice and output_now in voiceless_consonant_phonemes:
                output_now = voicing[output_now]
            elif should_devoice and output_now in voiced_consonant_phonemes:
                output_now = devoicing[output_now]
            # Phonemes outside both lists neither change nor block the spreading.
            should_voice = should_voice or output_now in voiced_consonant_phonemes
            should_devoice = should_devoice or output_now in voiceless_consonant_phonemes
        else:
            # A vowel ends any voicing/devoicing spread and decides whether
            # the consonants to its left are palatalized.
            should_voice = False
            should_devoice = False
            if input_now in back_vowel_letters:
                should_palatalize = False
            elif input_now in front_vowel_letters:
                should_palatalize = True

        # Digraphs
        if (input_now, input_ahead) == ('c', 'h'):
            reversed_output[-1] = 'x' + coarticulation
            # The <h> half of the digraph was processed as voiced /h/ and
            # switched voicing on; /x/ is voiceless, so cancel that and
            # devoice whatever precedes the digraph instead.
            should_voice = False
            should_devoice = True
        elif input_now == 'd' and output_ahead in ['z', 'zʲ', 'ʒ', 'ʒʲ']:
            reversed_output[-1] = 'd' + output_ahead
        elif (input_now, input_ahead) == ('i', 'e'):
            reversed_output[-1] = 'iə'
        elif (input_now, input_ahead) == ('u', 'o'):
            reversed_output[-1] = 'uə'
        # Collapse sibilant geminates and semi-geminates
        elif input_now in sibilant_letters and input_ahead in sibilant_letters:
            pass
        # Collapse true geminates of other kinds
        elif input_now == input_ahead and input_now in consonant_letters:
            pass
        # <i> before back vowels just marks that the preceding consonant cluster is palatalized
        elif input_now == 'i' and input_ahead in back_vowel_letters:
            should_palatalize = True
        elif palatalization in output_now or output_now == 'j':
            # A low back vowel is fronted after a palatalized consonant or /j/.
            if output_ahead == '?A':
                reversed_output[-1] = '?E'
            elif output_ahead == 'aː':
                reversed_output[-1] = 'æː'
            reversed_output.append(output_now)
        else:
            reversed_output.append(output_now)

    reversed_output.reverse()
    return reversed_output
# Driver: reads a tab-separated file (path in sys.argv[1]) with two columns,
# spelling and space-separated Wiktionary phonemes. For each row it prints
# the spelling and the transcription produced by transcribe(), with the
# '?A' / '?E' / '?O' placeholders resolved from the Wiktionary phonemes.
# Words whose placeholders cannot be resolved are reported and omitted.

# (Wiktionary phoneme substring, spelling substring) pairs; a row matching
# both members of any pair triggers the hiatus warning below.
hiatus = [
    ('ɪjæ', 'ia'),
    ('ɪjɛ', 'ie'),
    ('ɪjɛ', 'ia'),
    ('ɪjɔ', 'io'),
    ('ɪjʊ', 'iu'),
    ('ɛjɪ', 'ei'),
    ('ɐjɪ', 'ai'),
    ('ɔjɪ', 'oi'),
    ('ʊjɪ', 'ui')
]

# The data contains IPA symbols, so decode it explicitly as UTF-8 instead of
# relying on the locale default; newline='' is what the csv module expects,
# and the with-block makes sure the file is closed.
with open(sys.argv[1], encoding='utf-8', newline='') as source:
    reader = csv.reader(source, delimiter='\t')
    for row in reader:
        # Blank lines come back as empty rows; anything that is not exactly
        # two columns cannot be processed, so skip it rather than crash.
        if len(row) != 2:
            if row:
                logging.warning('Skipping malformed row: %r', row)
            continue
        graphemes, wiktionary_phonemes = row
        wiktionary_phonemes = wiktionary_phonemes.split()

        # Queues of Wiktionary vowels, consumed left to right while walking
        # through our own transcription.
        low_vowel_disambiguation = [
            ph for ph in wiktionary_phonemes
            if ph in ['aː', 'æː', 'a', 'æ', 'ɐ', 'ɛ', 'ɑ', 'ɑː']
        ]
        # A mid vowel right after /ʊ/ or /u/ is left out of the queue.
        mid_vowel_disambiguation = [
            ph for n, ph in enumerate(wiktionary_phonemes)
            if ph in ['ɔ', 'oː', 'o'] and (n == 0 or wiktionary_phonemes[n - 1] not in ['ʊ', 'u'])
        ]

        our_phonemes = transcribe(graphemes)
        should_skip = False
        for n, ph in enumerate(our_phonemes):
            # Unambiguously long low vowel
            if ph in ['aː', 'æː']:
                low_vowel_disambiguation = low_vowel_disambiguation[1:]
            # Either /ɐ/ or /aː/ (also count /a, ɑ, ɑː/ as instances of the latter)
            elif ph == '?A':
                if not low_vowel_disambiguation:
                    should_skip = True
                    break
                d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
                if d not in ['ɐ', 'aː', 'a', 'ɑ', 'ɑː']:
                    should_skip = True
                    break
                if d in ['a', 'ɑ', 'ɑː']:
                    d = 'aː'
                our_phonemes[n] = d
            # Either /ɛ/ or /æː/ (also count /æ/ as an instance of the latter)
            elif ph == '?E':
                if not low_vowel_disambiguation:
                    should_skip = True
                    break
                d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
                if d not in ['ɛ', 'æː', 'æ']:
                    should_skip = True
                    break
                if d == 'æ':
                    d = 'æː'
                our_phonemes[n] = d
            # Either /ɔ/ or /oː/ (also count /o/ as an instance of the latter)
            elif ph == '?O':
                if not mid_vowel_disambiguation:
                    should_skip = True
                    break
                d, mid_vowel_disambiguation = mid_vowel_disambiguation[0], mid_vowel_disambiguation[1:]
                if d == 'o':
                    d = 'oː'
                our_phonemes[n] = d

        wiktionary_joined = ''.join(wiktionary_phonemes)
        if any(ph in wiktionary_joined and gr in graphemes for (ph, gr) in hiatus):
            logging.warning(
                'The word <%s> might exhibit hiatus. Please verify if the transcription proposed by this '
                'script, namely /%s/, preserves the original hiatus in Wiktionary\'s /%s/, if any.',
                graphemes, ''.join(our_phonemes), wiktionary_joined)
        if should_skip:
            logging.warning('The script failed to disambiguate the vowels in <%s> and will omit this word.', graphemes)
        else:
            print(graphemes + '\t' + ' '.join(our_phonemes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment