# yokolet/kata_transcription.py

## kata_transcription.py
# https://en.wikibooks.org/wiki/Japanese/Transcribing_English_to_Japanese

import os
import sys
sys.path.append(os.path.abspath('./English-to-IPA'))

from pykakasi import kakasi
import romkan
import eng_to_ipa as ipa
import re

# Letters treated as vowels when walking the English spelling.
vowels = 'aeiou'
# Symbols treated as vowel sounds when walking a phonetic string: IPA vowel
# symbols plus the Latin letters a, e, i, o, u and j.
vowsym = 'ɑiɪɛɜæʌɒɔʊuəeajo'
# Symbols that transConsonants dispatches through consonant_map.
# Note that 'j' appears in both vowsym and conssym.
conssym = 'θðlŋjhwvʃʧtdkpgm'

def a_short_rule(word, w_idx):
    """Romanize IPA 'ɑ' using the spelling of ``word`` at ``w_idx``.

    Returns 'o' when the letter at ``w_idx`` is 'o', otherwise 'a'.
    ``w_idx`` may equal ``len(word)`` when the spelling has run out of
    vowel letters; that case falls back to 'a' instead of raising
    IndexError.
    """
    if w_idx < len(word) and word[w_idx] == 'o':
        return 'o'
    return 'a'

def a_long_rule(word, w_idx):
    """Romanize a long a-sound ('ɑː', 'ɑr', 'ər' in vowel_map) as 'aa'.

    Both arguments are accepted for the common rule signature and ignored.
    """
    return 'aa'

def i_long_rule(word, w_idx):
    """Romanize 'iː' / 'i' as the long vowel 'ii'; arguments are unused."""
    return 'ii'

def i_short_rule(word, w_idx):
    """Romanize short 'ɪ' as 'i'; arguments are unused."""
    return 'i'

def e_short_rule(word, w_idx):
    """Romanize short 'ɛ' as 'e'; arguments are unused."""
    return 'e'

def e_long_rule(word, w_idx):
    """Romanize 'ɜː' as 'aa'; arguments are unused."""
    return 'aa'

def ae_rule(word, w_idx):
    """Romanize 'æ' as 'ya' after a spelt 'c' or 'g', otherwise as 'a'.

    The letter inspected is the one immediately before ``w_idx`` in the
    English spelling ``word``.
    """
    after_c_or_g = w_idx >= 1 and word[w_idx - 1] in ('c', 'g')
    return 'ya' if after_c_or_g else 'a'

def hat_rule(word, w_idx):
    """Romanize 'ʌ': 'o' when it is spelt with the letter 'o', else 'a'.

    ``w_idx`` may equal ``len(word)`` when the spelling has no vowel
    letter left; that case returns 'a' instead of raising IndexError.
    """
    if w_idx < len(word) and word[w_idx] == 'o':
        return 'o'
    return 'a'

def o_short_rule(word, w_idx):
    """Romanize short 'ɒ' as 'o'; arguments are unused."""
    return 'o'

def o_long_rule(word, w_idx):
    """Romanize 'ɔː' / 'ɔ' / 'ɔr' as the long vowel 'oo'; arguments are unused."""
    return 'oo'

def u_short_rule(word, w_idx):
    """Romanize short 'ʊ' as 'u'; arguments are unused."""
    return 'u'

def u_long_rule(word, w_idx):
    """Romanize 'uː' / 'u' as the long vowel 'uu'; arguments are unused."""
    return 'uu'

def e2_rule(word, w_idx):
    """Romanize the schwa 'ə' from the spelling around ``w_idx``.

    Rules, checked in order:
      * the two letters before ``w_idx`` are 'bl' or 'gl' -> 'u'
      * ``w_idx`` is the last letter, or past the end of ``word`` -> 'a'
      * the letter is 'o' and is not followed by 'u' -> 'o'
      * anything else -> 'a'

    ``w_idx`` can equal ``len(word)`` when the spelling has fewer vowel
    letter groups than the phonetics has vowel sounds; that case returns
    'a' instead of raising IndexError.
    """
    if w_idx >= 2 and word[w_idx-2:w_idx] in ['bl', 'gl']:
        return 'u'
    if w_idx >= len(word) - 1:
        # Final letter, or no letter left to inspect.
        return 'a'
    if word[w_idx] == 'o' and word[w_idx + 1] != 'u':
        return 'o'
    return 'a'

def ei_rule(word, w_idx):
    """Romanize 'eɪ' / 'e': 'ei' when the next spelt letter is 'y', else 'e'."""
    if w_idx + 1 < len(word) and word[w_idx + 1] == 'y':
        return 'ei'
    return 'e'

def ai_rule(word, w_idx):
    """Romanize the diphthong 'aɪ' as 'ai'; arguments are unused."""
    return 'ai'

def oi_rule(word, w_idx):
    """Romanize the diphthong 'ɔɪ' as 'ooi'; arguments are unused."""
    return 'ooi'

def eu_rule(word, w_idx):
    """Romanize 'əʊ' / 'oʊ' as the long vowel 'oo'; arguments are unused."""
    return 'oo'

def au_rule(word, w_idx):
    """Romanize the diphthong 'aʊ' as 'au'; arguments are unused."""
    return 'au'

def ie_rule(word, w_idx):
    """Romanize 'ɪə' / 'ɪr' as 'iaa'; arguments are unused."""
    return 'iaa'

def ee_rule(word, w_idx):
    """Romanize 'ɛə' / 'ɛr' as 'eaa'; arguments are unused."""
    return 'eaa'

def ue_rule(word, w_idx):
    """Romanize 'ʊə' / 'ʊr' as 'uaa'; arguments are unused."""
    return 'uaa'

def ju_long_rule(word, w_idx):
    """Romanize 'juː' / 'ju' as 'yuu'; arguments are unused."""
    return 'yuu'

def jvow_rule(word, w_idx):
    """Keep a lone 'j' (one not part of 'ju' / 'juː') unchanged.

    The 'j' stays in the output of the vowel pass; 'j' is also listed in
    conssym, so the consonant pass sees it afterwards.
    """
    return 'j'

# Dispatch table used by transVowel: maps a phonetic vowel sequence (one to
# three symbols) to the rule function that produces its romanized form.
# Every rule is called as rule(word, w_idx).
vowel_map = {
    'ɑ': a_short_rule,
    'ɑː': a_long_rule,
    'ɑr': a_long_rule,
    'iː': i_long_rule,
    'i': i_long_rule,
    'ɪ': i_short_rule,
    'ɛ': e_short_rule,
    'ɜː': e_long_rule,
    'æ': ae_rule,
    'ʌ': hat_rule,
    'ɒ': o_short_rule,
    'ɔː': o_long_rule,
    'ɔ': o_long_rule,
    'ɔr': o_long_rule,
    'ʊ': u_short_rule,
    'uː': u_long_rule,
    'u': u_long_rule,
    'ə': e2_rule,
    'ər': a_long_rule,
    'eɪ': ei_rule,
    'e': ei_rule,
    'aɪ': ai_rule,
    'ɔɪ': oi_rule,
    'əʊ': eu_rule,
    'oʊ': eu_rule,
    'aʊ': au_rule,
    'ɪə': ie_rule,
    'ɪr': ie_rule,
    'ɛə': ee_rule,
    'ɛr': ee_rule,
    'ʊə': ue_rule,
    'ʊr': ue_rule,
    'juː': ju_long_rule,
    'ju': ju_long_rule,
    # A 'j' that does not start 'ju' / 'juː' is kept as-is by jvow_rule.
    'j': jvow_rule,
}
# æ after k => kya
# æ after g => gya
# ʌ spelt with an "o" => o, ex: monkey, front
# non-final ə => ?, ex: about, pilot, london
# final position ə spelt as "-r" => aa
# final position ə spelt with an "a" => a

def transVowel(word, ph):
    """Replace the vowel sounds of a phonetic string with romanized vowels.

    Walks ``ph`` (the phonetic transcription) and ``word`` (the English
    spelling) side by side. Symbols of ``ph`` that are not in ``vowsym``
    are copied through unchanged; vowel sounds are looked up in
    ``vowel_map`` (longest match first) and the matching rule is called
    with the spelling and the index of the current vowel letter group.

    Args:
        word: English spelling; passed to the rules for context.
        ph: Phonetic string for ``word``.

    Returns:
        ``ph`` with every recognized vowel sequence replaced.
    """
    pieces = []
    w_idx = 0
    p_idx = 0
    while w_idx < len(word) or p_idx < len(ph):
        # Move to the next vowel letter of the spelling.
        while w_idx < len(word) and word[w_idx] not in vowels:
            w_idx += 1
        # Copy non-vowel phonetic symbols through untouched.
        while p_idx < len(ph) and ph[p_idx] not in vowsym:
            pieces.append(ph[p_idx])
            p_idx += 1

        # Longest match first: 'juː', then two symbols, then one.
        if p_idx + 3 <= len(ph) and ph[p_idx:p_idx + 3] == 'juː':
            pieces.append(vowel_map['juː'](word, w_idx))
            p_idx += 3
        elif p_idx + 2 <= len(ph) and ph[p_idx:p_idx + 2] in vowel_map:
            pieces.append(vowel_map[ph[p_idx:p_idx + 2]](word, w_idx))
            p_idx += 2
        elif p_idx < len(ph):
            rule = vowel_map.get(ph[p_idx])
            # vowsym lists symbols that have no single-symbol entry in
            # vowel_map ('ɜ', 'a', 'o'); keep those as-is instead of
            # raising KeyError.
            pieces.append(rule(word, w_idx) if rule else ph[p_idx])
            p_idx += 1

        # One vowel sound may be spelt with several vowel letters.
        while w_idx < len(word) and word[w_idx] in vowels:
            w_idx += 1
    return ''.join(pieces)

def th_clear_rule(word, ph, p_idx):
    """Romanize 'θ' as 's'; all arguments are unused."""
    return 's'

def th_hakuon_rule(word, ph, p_idx):
    """Romanize 'ð' as 'z'; all arguments are unused."""
    return 'z'

def l_rule(word, ph, p_idx):
    """Romanize 'l' as 'r'; all arguments are unused."""
    return 'r'

def n_rule(word, ph, p_idx):
    """Romanize 'ŋ': 'Ng' when the spelling contains 'ng', otherwise 'N'.

    Only ``word`` is consulted; ``ph`` and ``p_idx`` are unused.
    """
    return 'Ng' if 'ng' in word else 'N'

def jcon_rule(word, ph, p_idx):
    """Romanize 'j': 'i' when the next symbol is 'i', 'ɪ' or 'e', else 'j'."""
    has_next = p_idx + 1 < len(ph)
    if has_next and ph[p_idx + 1] in 'iɪe':
        return 'i'
    return 'j'

def h_rule(word, ph, p_idx):
    """Romanize 'h': 'f' when the next symbol is 'u' or 'ʊ', else 'h'."""
    has_next = p_idx + 1 < len(ph)
    if has_next and ph[p_idx + 1] in 'uʊ':
        return 'f'
    return 'h'

def w_rule(word, ph, p_idx):
    """Romanize 'w': 'how' for a leading 'w' of a word spelt 'wh...', else 'u'."""
    leading_wh = p_idx == 0 and word.startswith('wh')
    return 'how' if leading_wh else 'u'

def v_rule(word, ph, p_idx):
    """Romanize 'v' as 'b'; all arguments are unused."""
    return 'b'

def s_rule(word, ph, p_idx):
    """Romanize 'ʃ': 'sshu' when it is final and follows a vowel, else 'sh'.

    The check is made for the 'ʃ' at ``p_idx`` itself. Looking only at the
    end of ``ph`` would also turn an earlier 'ʃ' into 'sshu' whenever the
    string happens to end in vowel + 'ʃ'.
    """
    is_final = p_idx == len(ph) - 1
    if is_final and p_idx >= 1 and ph[p_idx - 1] in vowsym:
        return 'sshu'
    return 'sh'

def ts_rule(word, ph, p_idx):
    """Romanize 'ʧ': 'cchi' after exactly one vowel symbol, else 'ch'.

    "Exactly one" means the symbol before ``p_idx`` is in ``vowsym`` and
    the one before that either does not exist or is not in ``vowsym``.
    """
    # p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
    # ph[-1] and wrap around to the last symbol of the string.
    single_vowel_before = (
        p_idx >= 1
        and ph[p_idx] == 'ʧ'
        and ph[p_idx - 1] in vowsym
        and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
    )
    return 'cchi' if single_vowel_before else 'ch'

def t_rule(word, ph, p_idx):
    """Romanize 't': 'tto' after exactly one vowel symbol, else 't'.

    "Exactly one" means the symbol before ``p_idx`` is in ``vowsym`` and
    the one before that either does not exist or is not in ``vowsym``.
    """
    # p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
    # ph[-1] and wrap around to the last symbol of the string.
    single_vowel_before = (
        p_idx >= 1
        and ph[p_idx] == 't'
        and ph[p_idx - 1] in vowsym
        and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
    )
    return 'tto' if single_vowel_before else 't'

def d_rule(word, ph, p_idx):
    """Romanize 'd'.

    Returns 'z' when the next symbol is 'z', 'ddo' when the 'd' follows
    exactly one vowel symbol (the symbol before it is in ``vowsym`` and
    the one before that is missing or not in ``vowsym``), else 'd'.
    """
    if p_idx < len(ph) - 1 and ph[p_idx + 1] == 'z':
        return 'z'
    # p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
    # ph[-1] and wrap around to the last symbol of the string.
    single_vowel_before = (
        p_idx >= 1
        and ph[p_idx] == 'd'
        and ph[p_idx - 1] in vowsym
        and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
    )
    return 'ddo' if single_vowel_before else 'd'

def kpg_rule(word, ph, p_idx):
    """Romanize 'k', 'p' or 'g': doubled plus 'u' after exactly one vowel.

    When the symbol before ``p_idx`` is in ``vowsym`` and the one before
    that is missing or not in ``vowsym``, returns e.g. 'kku' / 'ppu' /
    'ggu'; otherwise returns the symbol unchanged.
    """
    symbol = ph[p_idx]
    # p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
    # ph[-1] and wrap around to the last symbol of the string.
    single_vowel_before = (
        p_idx >= 1
        and symbol in 'kpg'
        and ph[p_idx - 1] in vowsym
        and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
    )
    return symbol + symbol + 'u' if single_vowel_before else symbol

def m_rule(word, ph, p_idx):
    """Romanize 'm': 'n' when another non-vowel symbol follows, else 'm'.

    A final 'm', or one followed by a symbol in ``vowsym``, stays 'm'.
    """
    is_last = p_idx >= len(ph) - 1
    if is_last or ph[p_idx + 1] in vowsym:
        return 'm'
    return 'n'

# Dispatch table used by transConsonants: maps one consonant symbol to the
# rule that romanizes it. Every rule is called as rule(word, ph, p_idx).
# The keys are exactly the characters listed in conssym.
consonant_map = {
    'θ': th_clear_rule,
    'ð': th_hakuon_rule,
    'l': l_rule,
    'ŋ': n_rule,
    'j': jcon_rule,
    'h': h_rule,
    'w': w_rule,
    'v': v_rule,
    'ʃ': s_rule,
    'ʧ': ts_rule,
    't': t_rule,
    'd': d_rule,
    # k, p and g share one rule.
    'k': kpg_rule,
    'p': kpg_rule,
    'g': kpg_rule,
    'm': m_rule
}

def transConsonants(word, step2):
    """Romanize the consonant symbols of ``step2``.

    Each character of ``step2`` that is listed in ``conssym`` is replaced
    by the output of its ``consonant_map`` rule, called with the spelling,
    the whole ``step2`` string and the character's index. Every other
    character is copied through unchanged.
    """
    pieces = []
    for pos, symbol in enumerate(step2):
        if symbol in conssym:
            pieces.append(consonant_map[symbol](word, step2, pos))
        else:
            pieces.append(symbol)
    return ''.join(pieces)

def transcribe(word):
    """Run the transcription pipeline on ``word`` and return the result.

    Steps: phonetic conversion via ``ipa.convert``, vowel romanization
    (``transVowel``), then consonant romanization (``transConsonants``).
    The intermediate values are printed as a trace. Epenthetic-vowel
    insertion (step 4) is not implemented yet.

    Returns:
        The string produced by the consonant step.
    """
    # step 1: make phonetic
    ph = ipa.convert(word)
    # step 2: convert vowels
    step2 = transVowel(word, ph)
    # step 3: convert consonants
    step3 = transConsonants(word, step2)
    print(word, ph, step2, step3)
    # step 4: add epenthetic vowels (TODO: not implemented)
    return step3
	# https://en.wikibooks.org/wiki/Japanese/Transcribing_English_to_Japanese

	import os
	import sys
	sys.path.append(os.path.abspath('./English-to-IPA'))

	from pykakasi import kakasi
	import romkan
	import eng_to_ipa as ipa
	import re

	# Letters treated as vowels when walking the English spelling.
	vowels = 'aeiou'
	# Symbols treated as vowel sounds when walking a phonetic string: IPA vowel
	# symbols plus the Latin letters a, e, i, o, u and j.
	vowsym = 'ɑiɪɛɜæʌɒɔʊuəeajo'
	# Symbols that transConsonants dispatches through consonant_map.
	# Note that 'j' appears in both vowsym and conssym.
	conssym = 'θðlŋjhwvʃʧtdkpgm'

	def a_short_rule(word, w_idx):
		"""Romanize IPA 'ɑ' using the spelling of ``word`` at ``w_idx``.

		Returns 'o' when the letter at ``w_idx`` is 'o', otherwise 'a'.
		``w_idx`` may equal ``len(word)`` when the spelling has run out of
		vowel letters; that case falls back to 'a' instead of raising
		IndexError.
		"""
		if w_idx < len(word) and word[w_idx] == 'o':
			return 'o'
		return 'a'

	def a_long_rule(word, w_idx):
		"""Romanize a long a-sound ('ɑː', 'ɑr', 'ər' in vowel_map) as 'aa'.

		Both arguments are accepted for the common rule signature and ignored.
		"""
		return 'aa'

	def i_long_rule(word, w_idx):
		"""Romanize 'iː' / 'i' as the long vowel 'ii'; arguments are unused."""
		return 'ii'

	def i_short_rule(word, w_idx):
		"""Romanize short 'ɪ' as 'i'; arguments are unused."""
		return 'i'

	def e_short_rule(word, w_idx):
		"""Romanize short 'ɛ' as 'e'; arguments are unused."""
		return 'e'

	def e_long_rule(word, w_idx):
		"""Romanize 'ɜː' as 'aa'; arguments are unused."""
		return 'aa'

	def ae_rule(word, w_idx):
		"""Romanize 'æ' as 'ya' after a spelt 'c' or 'g', otherwise as 'a'.

		The letter inspected is the one immediately before ``w_idx`` in the
		English spelling ``word``.
		"""
		after_c_or_g = w_idx >= 1 and word[w_idx - 1] in ('c', 'g')
		return 'ya' if after_c_or_g else 'a'

	def hat_rule(word, w_idx):
		"""Romanize 'ʌ': 'o' when it is spelt with the letter 'o', else 'a'.

		``w_idx`` may equal ``len(word)`` when the spelling has no vowel
		letter left; that case returns 'a' instead of raising IndexError.
		"""
		if w_idx < len(word) and word[w_idx] == 'o':
			return 'o'
		return 'a'

	def o_short_rule(word, w_idx):
		"""Romanize short 'ɒ' as 'o'; arguments are unused."""
		return 'o'

	def o_long_rule(word, w_idx):
		"""Romanize 'ɔː' / 'ɔ' / 'ɔr' as the long vowel 'oo'; arguments are unused."""
		return 'oo'

	def u_short_rule(word, w_idx):
		"""Romanize short 'ʊ' as 'u'; arguments are unused."""
		return 'u'

	def u_long_rule(word, w_idx):
		"""Romanize 'uː' / 'u' as the long vowel 'uu'; arguments are unused."""
		return 'uu'

	def e2_rule(word, w_idx):
		"""Romanize the schwa 'ə' from the spelling around ``w_idx``.

		Rules, checked in order:
		  * the two letters before ``w_idx`` are 'bl' or 'gl' -> 'u'
		  * ``w_idx`` is the last letter, or past the end of ``word`` -> 'a'
		  * the letter is 'o' and is not followed by 'u' -> 'o'
		  * anything else -> 'a'

		``w_idx`` can equal ``len(word)`` when the spelling has fewer vowel
		letter groups than the phonetics has vowel sounds; that case returns
		'a' instead of raising IndexError.
		"""
		if w_idx >= 2 and word[w_idx-2:w_idx] in ['bl', 'gl']:
			return 'u'
		if w_idx >= len(word) - 1:
			# Final letter, or no letter left to inspect.
			return 'a'
		if word[w_idx] == 'o' and word[w_idx + 1] != 'u':
			return 'o'
		return 'a'

	def ei_rule(word, w_idx):
		"""Romanize 'eɪ' / 'e': 'ei' when the next spelt letter is 'y', else 'e'."""
		if w_idx + 1 < len(word) and word[w_idx + 1] == 'y':
			return 'ei'
		return 'e'

	def ai_rule(word, w_idx):
		"""Romanize the diphthong 'aɪ' as 'ai'; arguments are unused."""
		return 'ai'

	def oi_rule(word, w_idx):
		"""Romanize the diphthong 'ɔɪ' as 'ooi'; arguments are unused."""
		return 'ooi'

	def eu_rule(word, w_idx):
		"""Romanize 'əʊ' / 'oʊ' as the long vowel 'oo'; arguments are unused."""
		return 'oo'

	def au_rule(word, w_idx):
		"""Romanize the diphthong 'aʊ' as 'au'; arguments are unused."""
		return 'au'

	def ie_rule(word, w_idx):
		"""Romanize 'ɪə' / 'ɪr' as 'iaa'; arguments are unused."""
		return 'iaa'

	def ee_rule(word, w_idx):
		"""Romanize 'ɛə' / 'ɛr' as 'eaa'; arguments are unused."""
		return 'eaa'

	def ue_rule(word, w_idx):
		"""Romanize 'ʊə' / 'ʊr' as 'uaa'; arguments are unused."""
		return 'uaa'

	def ju_long_rule(word, w_idx):
		"""Romanize 'juː' / 'ju' as 'yuu'; arguments are unused."""
		return 'yuu'

	def jvow_rule(word, w_idx):
		"""Keep a lone 'j' (one not part of 'ju' / 'juː') unchanged.

		The 'j' stays in the output of the vowel pass; 'j' is also listed in
		conssym, so the consonant pass sees it afterwards.
		"""
		return 'j'

	# Dispatch table used by transVowel: maps a phonetic vowel sequence (one to
	# three symbols) to the rule function that produces its romanized form.
	# Every rule is called as rule(word, w_idx).
	vowel_map = {
	'ɑ': a_short_rule,
	'ɑː': a_long_rule,
	'ɑr': a_long_rule,
	'iː': i_long_rule,
	'i': i_long_rule,
	'ɪ': i_short_rule,
	'ɛ': e_short_rule,
	'ɜː': e_long_rule,
	'æ': ae_rule,
	'ʌ': hat_rule,
	'ɒ': o_short_rule,
	'ɔː': o_long_rule,
	'ɔ': o_long_rule,
	'ɔr': o_long_rule,
	'ʊ': u_short_rule,
	'uː': u_long_rule,
	'u': u_long_rule,
	'ə': e2_rule,
	'ər': a_long_rule,
	'eɪ': ei_rule,
	'e': ei_rule,
	'aɪ': ai_rule,
	'ɔɪ': oi_rule,
	'əʊ': eu_rule,
	'oʊ': eu_rule,
	'aʊ': au_rule,
	'ɪə': ie_rule,
	'ɪr': ie_rule,
	'ɛə': ee_rule,
	'ɛr': ee_rule,
	'ʊə': ue_rule,
	'ʊr': ue_rule,
	'juː': ju_long_rule,
	'ju': ju_long_rule,
	# A 'j' that does not start 'ju' / 'juː' is kept as-is by jvow_rule.
	'j': jvow_rule,
	}
	# æ after k => kya
	# æ after g => gya
	# ʌ spelt with an "o" => o, ex: monkey, front
	# non-final ə => ?, ex: about, pilot, london
	# final position ə spelt as "-r" => aa
	# final position ə spelt with an "a" => a

	def transVowel(word, ph):
		"""Replace the vowel sounds of a phonetic string with romanized vowels.

		Walks ``ph`` (the phonetic transcription) and ``word`` (the English
		spelling) side by side. Symbols of ``ph`` that are not in ``vowsym``
		are copied through unchanged; vowel sounds are looked up in
		``vowel_map`` (longest match first) and the matching rule is called
		with the spelling and the index of the current vowel letter group.

		Args:
			word: English spelling; passed to the rules for context.
			ph: Phonetic string for ``word``.

		Returns:
			``ph`` with every recognized vowel sequence replaced.
		"""
		pieces = []
		w_idx = 0
		p_idx = 0
		while w_idx < len(word) or p_idx < len(ph):
			# Move to the next vowel letter of the spelling.
			while w_idx < len(word) and word[w_idx] not in vowels:
				w_idx += 1
			# Copy non-vowel phonetic symbols through untouched.
			while p_idx < len(ph) and ph[p_idx] not in vowsym:
				pieces.append(ph[p_idx])
				p_idx += 1

			# Longest match first: 'juː', then two symbols, then one.
			if p_idx + 3 <= len(ph) and ph[p_idx:p_idx + 3] == 'juː':
				pieces.append(vowel_map['juː'](word, w_idx))
				p_idx += 3
			elif p_idx + 2 <= len(ph) and ph[p_idx:p_idx + 2] in vowel_map:
				pieces.append(vowel_map[ph[p_idx:p_idx + 2]](word, w_idx))
				p_idx += 2
			elif p_idx < len(ph):
				rule = vowel_map.get(ph[p_idx])
				# vowsym lists symbols that have no single-symbol entry in
				# vowel_map ('ɜ', 'a', 'o'); keep those as-is instead of
				# raising KeyError.
				pieces.append(rule(word, w_idx) if rule else ph[p_idx])
				p_idx += 1

			# One vowel sound may be spelt with several vowel letters.
			while w_idx < len(word) and word[w_idx] in vowels:
				w_idx += 1
		return ''.join(pieces)

	def th_clear_rule(word, ph, p_idx):
		"""Romanize 'θ' as 's'; all arguments are unused."""
		return 's'

	def th_hakuon_rule(word, ph, p_idx):
		"""Romanize 'ð' as 'z'; all arguments are unused."""
		return 'z'

	def l_rule(word, ph, p_idx):
		"""Romanize 'l' as 'r'; all arguments are unused."""
		return 'r'

	def n_rule(word, ph, p_idx):
		"""Romanize 'ŋ': 'Ng' when the spelling contains 'ng', otherwise 'N'.

		Only ``word`` is consulted; ``ph`` and ``p_idx`` are unused.
		"""
		return 'Ng' if 'ng' in word else 'N'

	def jcon_rule(word, ph, p_idx):
		"""Romanize 'j': 'i' when the next symbol is 'i', 'ɪ' or 'e', else 'j'."""
		has_next = p_idx + 1 < len(ph)
		if has_next and ph[p_idx + 1] in 'iɪe':
			return 'i'
		return 'j'

	def h_rule(word, ph, p_idx):
		"""Romanize 'h': 'f' when the next symbol is 'u' or 'ʊ', else 'h'."""
		has_next = p_idx + 1 < len(ph)
		if has_next and ph[p_idx + 1] in 'uʊ':
			return 'f'
		return 'h'

	def w_rule(word, ph, p_idx):
		"""Romanize 'w': 'how' for a leading 'w' of a word spelt 'wh...', else 'u'."""
		leading_wh = p_idx == 0 and word.startswith('wh')
		return 'how' if leading_wh else 'u'

	def v_rule(word, ph, p_idx):
		"""Romanize 'v' as 'b'; all arguments are unused."""
		return 'b'

	def s_rule(word, ph, p_idx):
		"""Romanize 'ʃ': 'sshu' when it is final and follows a vowel, else 'sh'.

		The check is made for the 'ʃ' at ``p_idx`` itself. Looking only at the
		end of ``ph`` would also turn an earlier 'ʃ' into 'sshu' whenever the
		string happens to end in vowel + 'ʃ'.
		"""
		is_final = p_idx == len(ph) - 1
		if is_final and p_idx >= 1 and ph[p_idx - 1] in vowsym:
			return 'sshu'
		return 'sh'

	def ts_rule(word, ph, p_idx):
		"""Romanize 'ʧ': 'cchi' after exactly one vowel symbol, else 'ch'.

		"Exactly one" means the symbol before ``p_idx`` is in ``vowsym`` and
		the one before that either does not exist or is not in ``vowsym``.
		"""
		# p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
		# ph[-1] and wrap around to the last symbol of the string.
		single_vowel_before = (
			p_idx >= 1
			and ph[p_idx] == 'ʧ'
			and ph[p_idx - 1] in vowsym
			and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
		)
		return 'cchi' if single_vowel_before else 'ch'

	def t_rule(word, ph, p_idx):
		"""Romanize 't': 'tto' after exactly one vowel symbol, else 't'.

		"Exactly one" means the symbol before ``p_idx`` is in ``vowsym`` and
		the one before that either does not exist or is not in ``vowsym``.
		"""
		# p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
		# ph[-1] and wrap around to the last symbol of the string.
		single_vowel_before = (
			p_idx >= 1
			and ph[p_idx] == 't'
			and ph[p_idx - 1] in vowsym
			and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
		)
		return 'tto' if single_vowel_before else 't'

	def d_rule(word, ph, p_idx):
		"""Romanize 'd'.

		Returns 'z' when the next symbol is 'z', 'ddo' when the 'd' follows
		exactly one vowel symbol (the symbol before it is in ``vowsym`` and
		the one before that is missing or not in ``vowsym``), else 'd'.
		"""
		if p_idx < len(ph) - 1 and ph[p_idx + 1] == 'z':
			return 'z'
		# p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
		# ph[-1] and wrap around to the last symbol of the string.
		single_vowel_before = (
			p_idx >= 1
			and ph[p_idx] == 'd'
			and ph[p_idx - 1] in vowsym
			and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
		)
		return 'ddo' if single_vowel_before else 'd'

	def kpg_rule(word, ph, p_idx):
		"""Romanize 'k', 'p' or 'g': doubled plus 'u' after exactly one vowel.

		When the symbol before ``p_idx`` is in ``vowsym`` and the one before
		that is missing or not in ``vowsym``, returns e.g. 'kku' / 'ppu' /
		'ggu'; otherwise returns the symbol unchanged.
		"""
		symbol = ph[p_idx]
		# p_idx < 2 guards the look-behind: ph[p_idx-2] would otherwise be
		# ph[-1] and wrap around to the last symbol of the string.
		single_vowel_before = (
			p_idx >= 1
			and symbol in 'kpg'
			and ph[p_idx - 1] in vowsym
			and (p_idx < 2 or ph[p_idx - 2] not in vowsym)
		)
		return symbol + symbol + 'u' if single_vowel_before else symbol

	def m_rule(word, ph, p_idx):
		"""Romanize 'm': 'n' when another non-vowel symbol follows, else 'm'.

		A final 'm', or one followed by a symbol in ``vowsym``, stays 'm'.
		"""
		is_last = p_idx >= len(ph) - 1
		if is_last or ph[p_idx + 1] in vowsym:
			return 'm'
		return 'n'

	# Dispatch table used by transConsonants: maps one consonant symbol to the
	# rule that romanizes it. Every rule is called as rule(word, ph, p_idx).
	# The keys are exactly the characters listed in conssym.
	consonant_map = {
	'θ': th_clear_rule,
	'ð': th_hakuon_rule,
	'l': l_rule,
	'ŋ': n_rule,
	'j': jcon_rule,
	'h': h_rule,
	'w': w_rule,
	'v': v_rule,
	'ʃ': s_rule,
	'ʧ': ts_rule,
	't': t_rule,
	'd': d_rule,
	# k, p and g share one rule.
	'k': kpg_rule,
	'p': kpg_rule,
	'g': kpg_rule,
	'm': m_rule
	}

	def transConsonants(word, step2):
		"""Romanize the consonant symbols of ``step2``.

		Each character of ``step2`` that is listed in ``conssym`` is replaced
		by the output of its ``consonant_map`` rule, called with the spelling,
		the whole ``step2`` string and the character's index. Every other
		character is copied through unchanged.
		"""
		pieces = []
		for pos, symbol in enumerate(step2):
			if symbol in conssym:
				pieces.append(consonant_map[symbol](word, step2, pos))
			else:
				pieces.append(symbol)
		return ''.join(pieces)

	def transcribe(word):
		"""Run the transcription pipeline on ``word`` and return the result.

		Steps: phonetic conversion via ``ipa.convert``, vowel romanization
		(``transVowel``), then consonant romanization (``transConsonants``).
		The intermediate values are printed as a trace. Epenthetic-vowel
		insertion (step 4) is not implemented yet.

		Returns:
			The string produced by the consonant step.
		"""
		# step 1: make phonetic
		ph = ipa.convert(word)
		# step 2: convert vowels
		step2 = transVowel(word, ph)
		# step 3: convert consonants
		step3 = transConsonants(word, step2)
		print(word, ph, step2, step3)
		# step 4: add epenthetic vowels (TODO: not implemented)
		return step3