# antecedent/fix-lit-transcription.py

## fix-lit-transcription.py
import csv
import sys
import logging

# Superscript mark appended to a consonant phoneme to show palatalization.
_PALATALIZATION = 'ʲ'

# Context-free phoneme for every supported letter. '?A', '?E' and '?O' are
# placeholders for vowels whose quality cannot be read off the spelling.
_DEFAULT_PHONEMES = {
    'a': '?A',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
    'ą': 'aː',
    'b': 'b',
    'c': 'ts',
    'č': 'tʃ',
    'd': 'd',
    'e': '?E',  # Either long/tense or short/lax; use Wiktionary transcription to resolve
    'ę': 'æː',
    'ė': 'eː',
    'f': 'f',
    'g': 'ɡ',
    'h': 'h',
    'i': 'ɪ',
    'į': 'iː',
    'j': 'j',
    'k': 'k',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'o': '?O',  # Distinction between /oː/ and the loan-phoneme /ɔ/; also use Wiktionary data
    'p': 'p',
    'r': 'r',
    's': 's',
    'š': 'ʃ',
    't': 't',
    'u': 'ʊ',
    'ų': 'uː',
    'ū': 'uː',
    'v': 'ʋ',
    'y': 'iː',
    'z': 'z',
    'ž': 'ʒ',
}

_FRONT_VOWEL_LETTERS = frozenset('iįyeėę')
_BACK_VOWEL_LETTERS = frozenset('aąouųū')
_CONSONANT_LETTERS = frozenset('bcčdfghjklmnprsštvzž')
_SIBILANT_LETTERS = frozenset('sšzž')

# Obstruents that take part in voicing assimilation; the two tuples are
# aligned position by position (voiced[i] is the counterpart of voiceless[i]).
_VOICED_OBSTRUENTS = ('b', 'd', 'ɡ', 'z', 'ʒ', 'dz', 'dʒ', 'h')
_VOICELESS_OBSTRUENTS = ('p', 't', 'k', 's', 'ʃ', 'ts', 'tʃ', 'x')

# Plain and palatalized variants, keyed by the phoneme to be converted.
# Membership in one of these dicts doubles as the "is voiced" / "is voiceless" test.
_VOICED_TO_VOICELESS = {
    voiced + mark: voiceless + mark
    for voiced, voiceless in zip(_VOICED_OBSTRUENTS, _VOICELESS_OBSTRUENTS)
    for mark in ('', _PALATALIZATION)
}
_VOICELESS_TO_VOICED = {
    voiceless: voiced for voiced, voiceless in _VOICED_TO_VOICELESS.items()
}


def transcribe(spelling):
    """Convert a lowercase spelling into a list of phoneme strings.

    The word is scanned from its last letter to its first so that regressive
    processes (palatalization, voicing assimilation, word-final devoicing)
    can be modelled with simple flags carried from one letter to the next.

    Args:
        spelling: The word to transcribe, as a string made up only of the
            letters listed in ``_DEFAULT_PHONEMES``.

    Returns:
        A list of phoneme strings in reading order. The vowels <a>, <e> and
        <o> are emitted as the placeholders ``'?A'``, ``'?E'`` and ``'?O'``;
        resolving them is left to the caller. An empty spelling yields ``[]``.

    Raises:
        KeyError: If ``spelling`` contains a character with no entry in the
            letter table (for example an uppercase letter or a hyphen).
    """
    # 1. Flank with boundary sentinels
    # 2. Reverse, so that regressive processes are easier to simulate
    letters = ('#' + spelling + '#')[::-1]

    # Collected last-phoneme-first (so the "phoneme ahead" is always at the
    # end of the list) and put into reading order just before returning.
    segments = []

    # Flags for various types of regressive feature spreading
    should_palatalize = False
    should_voice = False
    should_devoice = True  # word-final devoicing

    # Pair each real letter with the letter that follows it in reading order;
    # the slice drops both sentinels from the "current letter" stream.
    for input_ahead, input_now in zip(letters, letters[1:-1]):
        output_ahead = segments[-1] if segments else '#'
        output_now = _DEFAULT_PHONEMES[input_now]
        coarticulation = _PALATALIZATION if should_palatalize and input_now != 'j' else ''

        if input_now in _CONSONANT_LETTERS:
            output_now += coarticulation
            if should_voice and output_now in _VOICELESS_TO_VOICED:
                output_now = _VOICELESS_TO_VOICED[output_now]
            elif should_devoice and output_now in _VOICED_TO_VOICELESS:
                output_now = _VOICED_TO_VOICELESS[output_now]
            # Consonants outside the obstruent tables leave both flags
            # untouched, so assimilation passes straight through them.
            should_voice = should_voice or output_now in _VOICED_TO_VOICELESS
            should_devoice = should_devoice or output_now in _VOICELESS_TO_VOICED
        else:
            # A vowel ends the consonant cluster and with it any voicing spread.
            should_voice = False
            should_devoice = False

        if input_now in _BACK_VOWEL_LETTERS:
            should_palatalize = False
        elif input_now in _FRONT_VOWEL_LETTERS:
            should_palatalize = True

        # Digraphs
        if (input_now, input_ahead) == ('c', 'h'):
            segments[-1] = 'x' + coarticulation
            # The digraph is emitted as voiceless /x/, so it must spread
            # voicelessness. The <h> half has usually just raised the voicing
            # flag, which is tested first; clear it or the consonant before
            # <ch> is voiced by mistake.
            should_voice = False
            should_devoice = True
        elif input_now == 'd' and output_ahead in ('z', 'zʲ', 'ʒ', 'ʒʲ'):
            segments[-1] = 'd' + output_ahead
        elif (input_now, input_ahead) == ('i', 'e'):
            segments[-1] = 'iə'
        elif (input_now, input_ahead) == ('u', 'o'):
            segments[-1] = 'uə'
        # Collapse sibilant geminates and semi-geminates
        elif input_now in _SIBILANT_LETTERS and input_ahead in _SIBILANT_LETTERS:
            pass
        # Collapse true geminates of other kinds
        elif input_now == input_ahead and input_now in _CONSONANT_LETTERS:
            pass
        # <i> before back vowels just marks that the preceding consonant cluster is palatalized
        elif input_now == 'i' and input_ahead in _BACK_VOWEL_LETTERS:
            should_palatalize = True
        else:
            if _PALATALIZATION in output_now or output_now == 'j':
                # A low vowel right after /j/ or a palatalized consonant is
                # replaced by its front counterpart.
                if output_ahead == '?A':
                    segments[-1] = '?E'
                elif output_ahead == 'aː':
                    segments[-1] = 'æː'
            segments.append(output_now)

    segments.reverse()
    return segments


# Driver: reads a tab-separated file (path in sys.argv[1]) whose rows are
# "<spelling>\t<space-separated Wiktionary phonemes>", transcribes each
# spelling with transcribe(), resolves the '?A' / '?E' / '?O' placeholders
# against the Wiktionary vowels, and prints "<spelling>\t<phonemes>" for every
# word that could be resolved. Problems are reported through logging.warning.

# Wiktionary vowels used as evidence for the low-vowel placeholders.
_WIKTIONARY_LOW_VOWELS = ('aː', 'æː', 'a', 'æ', 'ɐ', 'ɛ', 'ɑ', 'ɑː')
# Wiktionary vowels used as evidence for the mid-vowel placeholder.
_WIKTIONARY_MID_VOWELS = ('ɔ', 'oː', 'o')

# For each placeholder: which Wiktionary vowels may resolve it, and to what.
_PLACEHOLDER_RESOLUTIONS = {
    # Either /ɐ/ or /aː/ (also count /a, ɑ, ɑː/ as instances of the latter)
    '?A': {'ɐ': 'ɐ', 'aː': 'aː', 'a': 'aː', 'ɑ': 'aː', 'ɑː': 'aː'},
    # Either /ɛ/ or /æː/ (also count /æ/ as an instance of the latter)
    '?E': {'ɛ': 'ɛ', 'æː': 'æː', 'æ': 'æː'},
    # Either /ɔ/ or /oː/ (also count /o/ as an instance of the latter)
    '?O': {'ɔ': 'ɔ', 'oː': 'oː', 'o': 'oː'},
}

# (Wiktionary phoneme substring, spelling substring) pairs that suggest hiatus.
_HIATUS_PATTERNS = (
    ('ɪjæ', 'ia'),
    ('ɪjɛ', 'ie'),
    ('ɪjɛ', 'ia'),
    ('ɪjɔ', 'io'),
    ('ɪjʊ', 'iu'),
    ('ɛjɪ', 'ei'),
    ('ɐjɪ', 'ai'),
    ('ɔjɪ', 'oi'),
    ('ʊjɪ', 'ui'),
)


def _resolve_ambiguous_vowels(our_phonemes, wiktionary_phonemes):
    """Replace vowel placeholders in ``our_phonemes`` in place.

    Placeholders are matched, in order, against the low and mid vowels found
    in ``wiktionary_phonemes``.

    Returns:
        True if every placeholder was resolved. False as soon as one cannot
        be resolved; placeholders from that point on are left untouched.
    """
    low_vowels = iter([ph for ph in wiktionary_phonemes if ph in _WIKTIONARY_LOW_VOWELS])
    # A mid vowel right after /ʊ/ or /u/ is not used as evidence for '?O'.
    mid_vowels = iter([
        ph for n, ph in enumerate(wiktionary_phonemes)
        if ph in _WIKTIONARY_MID_VOWELS
        and (n == 0 or wiktionary_phonemes[n - 1] not in ('ʊ', 'u'))
    ])

    for n, ph in enumerate(our_phonemes):
        if ph in ('aː', 'æː'):
            # Unambiguously long low vowel: consume its Wiktionary counterpart
            # (if there is one) to keep the two sequences aligned.
            next(low_vowels, None)
        elif ph in _PLACEHOLDER_RESOLUTIONS:
            evidence = next(mid_vowels if ph == '?O' else low_vowels, None)
            resolved = _PLACEHOLDER_RESOLUTIONS[ph].get(evidence)
            if resolved is None:
                # Ran out of Wiktionary vowels, or the next one is incompatible.
                return False
            our_phonemes[n] = resolved
    return True


# newline='' is what the csv module expects of the file object. UTF-8 is
# assumed because the data carries IPA and Lithuanian letters (confirm against
# the actual input files).
with open(sys.argv[1], encoding='utf-8', newline='') as _source:
    for _row in csv.reader(_source, delimiter='\t'):
        if not _row:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        graphemes, wiktionary_phonemes = _row
        wiktionary_phonemes = wiktionary_phonemes.split()
        our_phonemes = transcribe(graphemes)
        should_skip = not _resolve_ambiguous_vowels(our_phonemes, wiktionary_phonemes)

        wiktionary_joined = ''.join(wiktionary_phonemes)
        if any(ph in wiktionary_joined and gr in graphemes for ph, gr in _HIATUS_PATTERNS):
            logging.warning(
                'The word <%s> might exhibit hiatus. Please verify if the transcription proposed by this '
                'script, namely /%s/, preserves the original hiatus in Wiktionary\'s /%s/, if any.',
                graphemes, ''.join(our_phonemes), wiktionary_joined)

        if should_skip:
            logging.warning(
                'The script failed to disambiguate the vowels in <%s> and will omit this word.',
                graphemes)
        else:
            print(graphemes + '\t' + ' '.join(our_phonemes))
	import csv
	import sys
	import logging

	# NOTE(review): from here on the file repeats the `transcribe` definition
	# found earlier in this file, but every line carries the same single-tab
	# indent. The body is therefore not nested under the `def`, and Python
	# cannot parse this copy as written. It looks like a whitespace-mangled
	# paste; confirm, then delete it or restore the indentation.
	#
	# Purpose (as far as the flattened text shows): turn a spelling into a list
	# of phoneme strings, scanning the word from its last letter to its first.
	def transcribe(spelling):
	# Context-free phoneme for each supported letter; '?A', '?E' and '?O' are
	# placeholders left for the caller to resolve.
	defaults = {
	'a': '?A', # Either long/tense or short/lax; use Wiktionary transcription to resolve
	'ą': 'aː',
	'b': 'b',
	'c': 'ts',
	'č': 'tʃ',
	'd': 'd',
	'e': '?E', # Either long/tense or short/lax; use Wiktionary transcription to resolve
	'ę': 'æː',
	'ė': 'eː',
	'f': 'f',
	'g': 'ɡ',
	'h': 'h',
	'i': 'ɪ',
	'į': 'iː',
	'j': 'j',
	'k': 'k',
	'l': 'l',
	'm': 'm',
	'n': 'n',
	'o': '?O', # Distinction between /oː/ and the loan-phoneme /ɔ/; also use Wiktionary data
	'p': 'p',
	'r': 'r',
	's': 's',
	'š': 'ʃ',
	't': 't',
	'u': 'ʊ',
	'ų': 'uː',
	'ū': 'uː',
	'v': 'ʋ',
	'y': 'iː',
	'z': 'z',
	'ž': 'ʒ',
	}

	palatalization = 'ʲ'
	front_vowel_letters = set('iįyeėę')
	back_vowel_letters = set('aąouųū')
	consonant_letters = set('bcčdfghjklmnprsštvzž')
	sibilant_letters = set('sšzž')
	# The two lists are aligned by position: voiced[i] pairs with voiceless[i].
	voiced_consonant_phonemes = ['b', 'd', 'ɡ', 'z', 'ʒ', 'dz', 'dʒ', 'h']
	voiceless_consonant_phonemes = ['p', 't', 'k', 's', 'ʃ', 'ts', 'tʃ', 'x']
	# Append the palatalized variant of every entry, keeping the alignment.
	voiced_consonant_phonemes += [ph + palatalization for ph in voiced_consonant_phonemes]
	voiceless_consonant_phonemes += [ph + palatalization for ph in voiceless_consonant_phonemes]

	# 1. Flank with boundary sentinels
	# 2. Reverse, so that regressive processes are easier to simulate
	spelling = list(reversed('#' + spelling + '#'))
	transcription = []

	# Overwrite the most recently emitted phoneme (the front of the list).
	def displace(with_what):
	transcription[0] = with_what

	# Prepend a phoneme; the list is therefore always in reading order.
	def insert(what):
	nonlocal transcription
	transcription = [what] + transcription

	# Flags for various types of regressive feature spreading
	should_palatalize = False
	should_voice = False
	should_devoice = True # word-final devoicing

	# Because the spelling is reversed, input_ahead is the letter that follows
	# input_now in reading order; input_behind is not used in the loop body.
	for input_ahead, input_now, input_behind in zip(spelling[:-2], spelling[1:-1], spelling[2:]):

	output_ahead = transcription[0] if transcription else '#'
	# Raises KeyError for any character missing from `defaults`.
	output_now = defaults[input_now]
	coarticulation = palatalization if should_palatalize and input_now != 'j' else ''

	if input_now in consonant_letters:
	output_now += coarticulation
	if should_voice and output_now in voiceless_consonant_phonemes:
	output_now = voiced_consonant_phonemes[voiceless_consonant_phonemes.index(output_now)]
	elif should_devoice and output_now in voiced_consonant_phonemes:
	output_now = voiceless_consonant_phonemes[voiced_consonant_phonemes.index(output_now)]
	should_voice = should_voice or output_now in voiced_consonant_phonemes
	should_devoice = should_devoice or output_now in voiceless_consonant_phonemes
	else:
	should_voice = False
	should_devoice = False

	if input_now in back_vowel_letters:
	should_palatalize = False
	elif input_now in front_vowel_letters:
	should_palatalize = True

	# Digraphs
	if (input_now, input_ahead) == ('c', 'h'):
	displace('x' + coarticulation)
	# NOTE(review): should_voice is not cleared here, and it is tested before
	# should_devoice above; verify that a consonant before <ch> is not voiced
	# unintentionally.
	should_devoice = True
	elif input_now == 'd' and output_ahead in ['z', 'zʲ', 'ʒ', 'ʒʲ']:
	displace('d' + output_ahead)
	elif (input_now, input_ahead) == ('i', 'e'):
	displace('iə')
	elif (input_now, input_ahead) == ('u', 'o'):
	displace('uə')
	# Collapse sibilant geminates and semi-geminates
	elif input_now in sibilant_letters and input_ahead in sibilant_letters:
	pass
	# Collapse true geminates of other kinds
	elif input_now == input_ahead and input_now in consonant_letters:
	pass
	# <i> before back vowels just marks that the preceding consonant cluster is palatalized
	elif input_now == 'i' and input_ahead in back_vowel_letters:
	should_palatalize = True
	elif palatalization in output_now or output_now == 'j':
	# After /j/ or a palatalized consonant, a following low vowel is swapped
	# for its front counterpart before the consonant is emitted.
	if output_ahead == '?A':
	displace('?E')
	elif output_ahead == 'aː':
	displace('æː')
	insert(output_now)
	else:
	insert(output_now)
	return transcription


	# NOTE(review): this repeats the driver script found earlier in this file
	# with every line flattened to a single-tab indent, so the loop and branch
	# bodies are no longer nested and Python cannot parse it as written.
	# Confirm it is a mangled paste, then delete it or restore the indentation.
	#
	# Reads tab-separated rows "<spelling>\t<Wiktionary phonemes>" from the file
	# named in sys.argv[1]. The file object is never closed explicitly and no
	# encoding is passed to open().
	reader = csv.reader(open(sys.argv[1]), delimiter='\t')
	# Each row must have exactly two columns, otherwise the unpacking fails.
	for graphemes, wiktionary_phonemes in reader:
	wiktionary_phonemes = wiktionary_phonemes.split()
	# Wiktionary low vowels, in order; used as evidence for '?A' and '?E'.
	low_vowel_disambiguation = [ph for ph in wiktionary_phonemes if ph in ['aː', 'æː', 'a', 'æ', 'ɐ', 'ɛ', 'ɑ', 'ɑː']]
	# Wiktionary mid vowels, in order, except those directly after /ʊ/ or /u/;
	# used as evidence for '?O'.
	mid_vowel_disambiguation = [ph for n, ph in enumerate(wiktionary_phonemes) \
	if ph in ['ɔ', 'oː', 'o'] and (n == 0 or wiktionary_phonemes[n - 1] not in ['ʊ', 'u'])]
	our_phonemes = transcribe(graphemes)
	# Set when a placeholder cannot be resolved; the word is then not printed.
	should_skip = False
	for n, ph in enumerate(our_phonemes):
	# Unambiguously long low vowel
	if ph in ['aː', 'æː']:
	low_vowel_disambiguation = low_vowel_disambiguation[1:]
	# Either /ɐ/ or /aː/ (also count /a, ɑ, ɑː/ as instances of the latter)
	elif ph == '?A':
	if not low_vowel_disambiguation:
	should_skip = True
	break
	d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
	if d not in ['ɐ', 'aː', 'a', 'ɑ', 'ɑː']:
	should_skip = True
	break
	if d in ['a', 'ɑ', 'ɑː']:
	d = 'aː'
	our_phonemes[n] = d
	# Either /ɛ/ or /æː/ (also count /æ/ as an instance of the latter)
	elif ph == '?E':
	if not low_vowel_disambiguation:
	should_skip = True
	break
	d, low_vowel_disambiguation = low_vowel_disambiguation[0], low_vowel_disambiguation[1:]
	if d not in ['ɛ', 'æː', 'æ']:
	should_skip = True
	break
	if d == 'æ':
	d = 'æː'
	our_phonemes[n] = d
	# Either /ɔ/ or /oː/ (also count /o/ as an instance of the latter)
	elif ph == '?O':
	if not mid_vowel_disambiguation:
	should_skip = True
	break
	d, mid_vowel_disambiguation = mid_vowel_disambiguation[0], mid_vowel_disambiguation[1:]
	if d == 'o':
	d = 'oː'
	our_phonemes[n] = d

	# Values for the three %s slots of the hiatus warning below. When the loop
	# above stopped early, our_phonemes may still contain placeholders.
	substrings = graphemes, ''.join(our_phonemes), ''.join(wiktionary_phonemes)
	# (Wiktionary phoneme substring, spelling substring) pairs; the warning
	# fires when both members of any pair are present.
	hiatus = [
	('ɪjæ', 'ia'),
	('ɪjɛ', 'ie'),
	('ɪjɛ', 'ia'),
	('ɪjɔ', 'io'),
	('ɪjʊ', 'iu'),
	('ɛjɪ', 'ei'),
	('ɐjɪ', 'ai'),
	('ɔjɪ', 'oi'),
	('ʊjɪ', 'ui')
	]
	if any(ph in ''.join(wiktionary_phonemes) and gr in graphemes for (ph, gr) in hiatus):
	logging.warning(('The word <%s> might exhibit hiatus. Please verify if the transcription proposed by this ' + \
	'script, namely /%s/, preserves the original hiatus in Wiktionary\'s /%s/, if any.') % substrings)

	# The hiatus warning above is issued whether or not the word is skipped.
	if should_skip:
	logging.warning('The script failed to disambiguate the vowels in <%s> and will omit this word.' % graphemes)
	else:
	print(graphemes + '\t' + ' '.join(our_phonemes))