# poppingtonic/english_to_tengwar.py

## english_to_tengwar.py
# This file converts English text to Tengwar, using my own personal
# preferences for transliterating Tengwar.
#
# Currently, the output that is created is intended for use with the Tengwar
# Annatar font and related font families.
#
# Example usage:
# >>> print(convert("This was a triumph. I'm making a note here: huge success!"))
#
# -- then paste the resulting text into a document rendered in Tengwar Annatar.
#
# Chelsea Voss, 2015


# Examples can either yield a single character, or a single character after a
# carrier

# In addition to this, some characters have multiple values, and those depend
# on the previous values


# Marker characters used internally while a word is being rewritten:
# TH for voiced th (eth), R for pre-vowel r, Y for consonant y, and C for the
# c of 'ch'. Digraphs (sh, zh, ch, ph, kh, gh, wh, ng, rd, ld, th) are matched
# directly by the `doubles` table.

def dictzip(str1, str2):
    """Map each item of ``str1`` to the item at the same position in ``str2``.

    Both arguments must have the same length; this is checked with an
    ``assert``.  When ``str1`` repeats an item, the last pairing wins.

    Built with ``zip`` instead of ``xrange`` indexing so that the helper
    runs unchanged on Python 2 and Python 3.
    """
    assert len(str1) == len(str2)
    return dict(zip(str1, str2))


# So, English has two different pronunciations of 'th', and Tengwar
# distinguishes between them. TODO: use a library to determine which 'th'
# we're dealing with. In the meantime: voiced 'th' is the rare one,
# so these cases handle that.

# Word beginnings with a voiced 'th'.  replace_th() applies these as
# prefixes; each entry contains a single 'th'.
voiced_th_prefices = [
    'their', 'these', 'those', 'although', 'them',
    'thine', 'thy', 'thou', 'there',
]

# Prefixes with two 'th's of which only the second one is voiced.
voiced_th_special_prefices = [
    'thither',
]

# Voiced only when they are the whole word -- used as prefixes they would
# also catch unvoiced words such as 'thank'.
voiced_th_solo_prefices = [
    'that', 'this', 'than', 'they', 'thee', 'though',
]

# Voiced wherever they occur inside a word; each entry should contain only
# one 'th'.
voiced_th_always_safe = [
    'feather', 'together', 'bathing', 'bathe', 'father',
    'mother', 'clothing', 'clothe', 'brother', 'weather',
    'either', 'gather', 'other', 'another', 'worthy',
    'rather', 'soothing', 'soothe', 'smooth', 'leather',
    'tether', 'breathe', 'breathing', 'lathe', 'seethe',
    'seething', 'scathe', 'scathing', 'teethe', 'teething',
    'loath', 'loathing', 'neither', 'thence', 'rhythm',
    'slither', 'southern', 'bother', 'altogether', 'lather',
    'hither',
]

def replace_th(inp):
    """Upper-case every 'th' that the word lists identify as voiced.

    Four rules run in this order, each on the result of the previous one:

    1. every entry of ``voiced_th_always_safe`` found anywhere in the word;
    2. the word being exactly an entry of ``voiced_th_solo_prefices``;
    3. the word starting with an entry of ``voiced_th_prefices``;
    4. the word starting with an entry of ``voiced_th_special_prefices``,
       where only the second 'th' of the entry is marked.

    ``inp`` is expected in lower case; the marked word is returned.
    """
    for fragment in voiced_th_always_safe:
        # str.replace() leaves the word alone when the fragment is absent.
        inp = inp.replace(fragment, fragment.replace('th', 'TH'))

    if inp in voiced_th_solo_prefices:
        inp = inp.replace('th', 'TH')

    for prefix in voiced_th_prefices:
        if inp.startswith(prefix):
            inp = inp.replace(prefix, prefix.replace('th', 'TH'))

    for prefix in voiced_th_special_prefices:
        if inp.startswith(prefix):
            # Mark the first two 'th's, then undo the first mark again.
            both_marked = prefix.replace('th', 'TH', 2)
            second_only = both_marked.replace('TH', 'th', 1)
            inp = inp.replace(prefix, second_only)

    return inp

# Keystrokes emitted for every punctuation or whitespace character that
# elfify_token() receives as a single-character token.
punctuation = {
    # Sentence punctuation
    '.': u'-',
    ':': '-',
    ',': u'\xb7',
    ';': u'\xc3',
    '!': u'\xc1',
    '?': u'\xc0',

    # Quotes, dashes and underscores
    '"': u'\xbb',
    "'": u'\xb2',
    '`': u'\xb1',
    '-': u'\\',
    '_': u'\xc2',

    # Both slashes share one key
    '/': u'\u203a',
    '\\': u'\u203a',

    # Every opening bracket maps to '*', every closing bracket to 'I'
    '<': '*',
    '[': '*',
    '{': '*',
    '(': '*',
    '>': 'I',
    ']': 'I',
    '}': 'I',
    ')': 'I',

    # Symbols; several values are multi-key sequences (for example '1E' is
    # what the word 'at' transliterates to)
    '@': '1E',
    '#': '9dE1x#',
    '$': u'k\xa1',
    '%': 'q6R85$1',
    '^': 'z7D1R',
    '&': '5#2',
    '*': u'\u02c6',
    '=': u'\xac',
    '+': u'` \xb0',
    '|': u'\xbd',

    # Whitespace
    ' ': ' ',
    '\n': '\n',
    '\t': u'\xb7-\xb7',
}


def elfify_start(inp):
    """Transliterate a whole English text into Tengwar Annatar keystrokes.

    The text is cut into tokens -- runs of word characters and apostrophes,
    or single punctuation/whitespace characters -- and every token is
    converted with elfify_token().  Characters that match neither
    alternative are dropped.

    Returns a unicode string (``str`` on Python 3).
    """
    import re

    # The hyphen is escaped on purpose: an unescaped "'-_" is read as a
    # character range.  "\t" is listed so that the tab entry of the
    # punctuation table can actually be reached.
    tokens = re.findall(
        r"[\w']+|[.,!?;\"'\-_`:<>/\\\[\](){}@#$%^&*=+| \n\t]", inp)

    # u''.join() promotes byte strings to unicode on Python 2 and is a plain
    # str join on Python 3, where the ``unicode`` builtin does not exist.
    return u''.join(elfify_token(token) for token in tokens)

def elfify_token(item):
    """Convert one token produced by elfify_start().

    * A key of ``punctuation`` yields its table entry.
    * A token made only of digits goes to elfify_number().
    * Anything else is treated as a word: apostrophes are removed and the
      result is passed to elfify_word().
    """
    if item in punctuation:
        return punctuation[item]

    if item.isdigit():
        # isdigit() also accepts characters such as superscript digits that
        # int() rejects; those tokens fall through to the word path instead
        # of raising ValueError.
        try:
            number = int(item)
        except ValueError:
            number = None
        if number is not None:
            return elfify_number(number)

    return elfify_word(item.replace("'", ""))


def elfify_number(num):
    """Return the placeholder keystrokes used for any number.

    ``num`` is currently ignored: every number is rendered as the same five
    backtick keys.
    """
    # TODO: implement fancy base-12 Elvish numerals
    placeholder = '`````'
    return placeholder


def elfify_word(inp):
    """Transliterate a single word into Tengwar Annatar keystrokes.

    The word is lower-cased and then processed in these steps:

    * '' is returned as is; 'of' and 'the' map to the single keys 'W'
      and '@';
    * voiced 'th' is marked by replace_th();
    * letters are normalised (c -> s, C or k; g -> j before e, i, y;
      q -> k; r -> R before a, e, i, o, u, y; y -> Y before a, e, i, o, u);
    * a final 's' that does not follow a, i, o or u is detached, then a
      final 'e' that does not follow a vowel or y (word still at least
      three letters long) is detached;
    * what remains is converted by elfify_postfix();
    * an 'i' or ',' key that is not followed by one of the keys in
      ``vowels`` is swapped for '8' or 'k' respectively (the last key is
      never swapped);
    * the detached endings are appended again: 'O' for the e, then a
      final-s key chosen by the key it follows.
    """
    inp = inp.lower()

    if inp == '':
        return inp
    if inp == 'of':
        return 'W'
    if inp == 'the':
        return '@'

    # Voiced th comes back as 'TH'.
    letters = list(replace_th(inp))
    final = len(letters) - 1

    # One left-to-right pass.  Only the current position is rewritten, so
    # the neighbour that decides each rule is still in its original form.
    for pos, letter in enumerate(letters):
        following = letters[pos + 1] if pos < final else None
        if letter == 'q':
            letters[pos] = 'k'
        elif letter == 'c':
            if following is None:
                letters[pos] = 'k'      # word-final c
            elif following in 'eiy':
                letters[pos] = 's'      # soft c
            elif following == 'h':
                letters[pos] = 'C'      # Ch
            else:
                letters[pos] = 'k'      # hard c
        elif following is None:
            continue
        elif letter == 'g' and following in 'eiy':
            letters[pos] = 'j'          # soft g
        elif letter == 'r' and following in 'aeiouy':
            letters[pos] = 'R'          # pre-vowel r
        elif letter == 'y' and following in 'aeiou':
            letters[pos] = 'Y'          # consonant y
    inp = ''.join(letters)

    # Detach the ending s -- unless it comes right after a, i, o or u.
    has_trailing_s = (len(inp) > 1 and inp[-1] == 's'
                      and inp[-2] not in 'aiou')
    if has_trailing_s:
        inp = inp[:-1]

    # Detach the ending e when the letter before it is not a vowel or y.
    has_trailing_e = (len(inp) >= 3 and inp[-1] == 'e'
                      and inp[-2] not in 'aeiouy')
    if has_trailing_e:
        inp = inp[:-1]

    # Elfification
    output = elfify_postfix(inp) if len(inp) != 0 else carrier

    # Swap s and z keys that are not followed by a vowel key.
    keys = list(output)
    for pos in range(len(keys) - 1):
        if keys[pos + 1] in vowels:
            continue
        if keys[pos] == 'i':
            keys[pos] = '8'             # fancy S
        elif keys[pos] == ',':
            keys[pos] = 'k'             # fancy Z
    output = ''.join(keys)

    # Re-attach the endings: e first, then s.
    if has_trailing_e:
        output = output + 'O'

    if has_trailing_s:
        previous = output[-1]
        if previous in '7um8k':
            ending = u'\xc5'
        elif previous in 'qwertyo':
            ending = u'\xc6'
        elif previous in 'l9':
            ending = u'\xa5'
        else:
            ending = '_'
        output = output + ending

    return output

# Single-letter consonants and the keys they produce, paired position by
# position.  'R' and 'Y' are the markers elfify_word() writes for pre-vowel r
# and consonant y.  The '-' characters only keep the two strings aligned;
# they collapse into a single '-' entry, and elfify_postfix() handles
# non-alphabetic characters before it consults this table.
_consonant_letters = 'tdnrRhpbfvmwsj--lYkg-z-'
_consonant_keys = u'125679qwertyisghjlzxn,.'
consonants = dictzip(_consonant_letters, _consonant_keys)

# Two-letter combinations that map to a single key.
doubles = {
    'th': '3',
    'TH': '4',  # voiced
    'sh': 'd',
    'zh': 'f',
    'ch': 'a',
    'Ch': 'a',
    'ph': 'e',
    'kh': 'c',
    'gh': 'v',
    'wh': 'o',
    'ng': 'b',
    'rd': 'u',
    'ld': 'm',
}

# For every vowel letter, the four alternative keys it can be written with.
# Which of the four is used depends on the key the vowel is attached to
# (see vowels_for_consonants).
vowel_series = {
    'a': '#EDC',
    'e': '$RFV',
    'i': '%TGB',
    'o': '^YHN',
    'u': '&UJM',
    'y': u'\xd8\xd9\xda\xdb',
}

# All vowel keys of the a, e, i, o and u series (the y series is not part
# of this string).
vowels = ''.join(vowel_series[letter] for letter in 'aeiou')

# Index into the strings of vowel_series, looked up by the key the vowel is
# attached to.  For example index 0 selects '#' for an 'a'.
vowels_for_consonants = {
    # index 0
    '2': 0, 'w': 0, 's': 0, 'x': 0, '4': 0,
    'r': 0, 'f': 0, 'v': 0, '5': 0, 't': 0,
    'g': 0, 'b': 0, 'j': 0, 'm': 0, 'o': 0,
    # index 1
    '1': 1, 'q': 1, 'd': 1, 'c': 1, '6': 1, 'y': 1,
    # index 2
    'a': 2, 'z': 2, '3': 2, 'e': 2, 'h': 2, 'n': 2,
    '7': 2, 'u': 2, 'i': 2, ',': 2, 'l': 2, '.': 2,
    # index 3
    '`': 3, '~': 3, '9': 3,
}

# Keys that carry a vowel when there is no consonant key to attach it to.
short_carrier = '`'
long_carrier = '~'
carrier = short_carrier

def elfify_postfix(postfix):
    """Convert a normalised word (or word remainder) into keystrokes.

    The word is read left to right.  At each position, in this order:

    * a non-alphabetic character becomes a '`' key;
    * a key of ``doubles`` that starts here is consumed as a whole;
    * a vowel letter is attached to what follows (see below);
    * a key of ``consonants`` becomes its table entry;
    * 'x' becomes the two keys u'z\\xe6'.

    A vowel is written after the first key of the consonant that follows
    it, using the variant from ``vowel_series`` that
    ``vowels_for_consonants`` assigns to that key.  At the end of the word,
    or directly before another vowel, it is written on ``carrier`` instead.

    The conversion is iterative, so the length of the word is not limited
    by the interpreter's recursion depth.

    Returns the keystrokes; an empty input gives ''.

    Raises:
        NotImplementedError: for an alphabetic character that has no
            mapping.  The message is "<character>, <text after it>".
    """
    # Pass 1: split the word into (is_vowel, text) units.
    units = []
    pos = 0
    while pos < len(postfix):
        char = postfix[pos]
        step = 1
        if not char.isalpha():
            # TODO: Actually add the appropriate character
            units.append((False, '`'))
        else:
            double = next(
                (key for key in doubles if postfix.startswith(key, pos)),
                None)
            if double is not None:
                units.append((False, doubles[double]))
                step = len(double)
            elif char in vowel_series:
                units.append((True, char))
            elif char in consonants:
                # TODO: Maybe add a doubler ('") if the next consonant is
                # the same thing!
                units.append((False, consonants[char]))
            elif char == 'x':
                units.append((False, u'z\xe6'))
            else:
                raise NotImplementedError(
                    "%s, %s" % (char, postfix[pos + 1:]))
        pos += step

    # Pass 2: emit the keys, attaching each vowel to its host.
    # TODO: Fancy n-bars and w-bars here.
    pieces = []
    index = 0
    while index < len(units):
        is_vowel, text = units[index]
        if not is_vowel:
            pieces.append(text)
            index += 1
            continue

        follower = units[index + 1] if index + 1 < len(units) else None
        if follower is None or follower[0]:
            # End of the word, or another vowel follows: use a carrier.
            host, remainder = carrier, ''
            index += 1
        else:
            # The vowel goes right after the first key of the consonant;
            # any further keys of that consonant come after the vowel.
            host, remainder = follower[1][0], follower[1][1:]
            index += 2
        variant = vowel_series[text][vowels_for_consonants[host]]
        pieces.append(host + variant + remainder)

    return ''.join(pieces)

# Public entry point: convert(text) is an alias for elfify_start(text).
convert = elfify_start
	# This file converts English text to Tengwar, using my own personal
	# preferences for transliterating Tengwar.
	#
	# Currently, the output that is created is intended for use with the Tengwar
	# Annatar font and related font families.
	#
	# Example usage:
	# >>> print convert("This was a triumph. I'm making a note here: huge success!")
	#
	# -- then paste the resulting text into a document rendered in Tengwar Annatar.
	#
	# Chelsea Voss, 2015


	# Examples can either yield a single character, or a single character after a
	# carrier

	# In addition to this, some characters have multiple values, and those depend
	# on the previous values


	# Special characters: T for theta, D for eth
	# R for pre-vowel r, S and Z for vowel-less s and z
	# Q for rd, L for ld, W for wh, C for ch, K for kh, G for gh, X for sh, H for
	# zh, N for ng

	def dictzip(str1, str2):
		"""Map each item of ``str1`` to the item at the same position in ``str2``.

		Both arguments must have the same length; this is checked with an
		``assert``.  When ``str1`` repeats an item, the last pairing wins.

		Built with ``zip`` instead of ``xrange`` indexing so that the helper
		runs unchanged on Python 2 and Python 3.
		"""
		assert len(str1) == len(str2)
		return dict(zip(str1, str2))


	# So, English has two different pronunciations of 'th', and Tengwar
	# distinguishes between them. TODO: use a library to determine which 'th'
	# we're dealing with. In the meantime: voiced 'th' is the rare one,
	# so these cases handle that.

	# Word beginnings with a voiced 'th'.  replace_th() applies these as
	# prefixes; each entry contains a single 'th'.
	voiced_th_prefices = [
		'their', 'these', 'those', 'although', 'them',
		'thine', 'thy', 'thou', 'there',
	]

	# Prefixes with two 'th's of which only the second one is voiced.
	voiced_th_special_prefices = [
		'thither',
	]

	# Voiced only when they are the whole word -- used as prefixes they would
	# also catch unvoiced words such as 'thank'.
	voiced_th_solo_prefices = [
		'that', 'this', 'than', 'they', 'thee', 'though',
	]

	# Voiced wherever they occur inside a word; each entry should contain only
	# one 'th'.
	voiced_th_always_safe = [
		'feather', 'together', 'bathing', 'bathe', 'father',
		'mother', 'clothing', 'clothe', 'brother', 'weather',
		'either', 'gather', 'other', 'another', 'worthy',
		'rather', 'soothing', 'soothe', 'smooth', 'leather',
		'tether', 'breathe', 'breathing', 'lathe', 'seethe',
		'seething', 'scathe', 'scathing', 'teethe', 'teething',
		'loath', 'loathing', 'neither', 'thence', 'rhythm',
		'slither', 'southern', 'bother', 'altogether', 'lather',
		'hither',
	]

	def replace_th(inp):
		"""Upper-case every 'th' that the word lists identify as voiced.

		Four rules run in this order, each on the result of the previous one:

		1. every entry of ``voiced_th_always_safe`` found anywhere in the word;
		2. the word being exactly an entry of ``voiced_th_solo_prefices``;
		3. the word starting with an entry of ``voiced_th_prefices``;
		4. the word starting with an entry of ``voiced_th_special_prefices``,
		   where only the second 'th' of the entry is marked.

		``inp`` is expected in lower case; the marked word is returned.
		"""
		for fragment in voiced_th_always_safe:
			# str.replace() leaves the word alone when the fragment is absent.
			inp = inp.replace(fragment, fragment.replace('th', 'TH'))

		if inp in voiced_th_solo_prefices:
			inp = inp.replace('th', 'TH')

		for prefix in voiced_th_prefices:
			if inp.startswith(prefix):
				inp = inp.replace(prefix, prefix.replace('th', 'TH'))

		for prefix in voiced_th_special_prefices:
			if inp.startswith(prefix):
				# Mark the first two 'th's, then undo the first mark again.
				both_marked = prefix.replace('th', 'TH', 2)
				second_only = both_marked.replace('TH', 'th', 1)
				inp = inp.replace(prefix, second_only)

		return inp

	# Keystrokes emitted for every punctuation or whitespace character that
	# elfify_token() receives as a single-character token.
	punctuation = {
		# Sentence punctuation
		'.': u'-',
		':': '-',
		',': u'\xb7',
		';': u'\xc3',
		'!': u'\xc1',
		'?': u'\xc0',

		# Quotes, dashes and underscores
		'"': u'\xbb',
		"'": u'\xb2',
		'`': u'\xb1',
		'-': u'\\',
		'_': u'\xc2',

		# Both slashes share one key
		'/': u'\u203a',
		'\\': u'\u203a',

		# Every opening bracket maps to '*', every closing bracket to 'I'
		'<': '*',
		'[': '*',
		'{': '*',
		'(': '*',
		'>': 'I',
		']': 'I',
		'}': 'I',
		')': 'I',

		# Symbols; several values are multi-key sequences
		'@': '1E',
		'#': '9dE1x#',
		'$': u'k\xa1',
		'%': 'q6R85$1',
		'^': 'z7D1R',
		'&': '5#2',
		'*': u'\u02c6',
		'=': u'\xac',
		'+': u'` \xb0',
		# The key must be the bare pipe: tokens are single characters, so a
		# two-character backslash-pipe key would never be looked up.
		'|': u'\xbd',

		# Whitespace
		' ': ' ',
		'\n': '\n',
		'\t': u'\xb7-\xb7',
	}


	def elfify_start(inp):
		"""Transliterate a whole English text into Tengwar Annatar keystrokes.

		The text is cut into tokens -- runs of word characters and apostrophes,
		or single punctuation/whitespace characters -- and every token is
		converted with elfify_token().  Characters that match neither
		alternative are dropped.

		Returns a unicode string (``str`` on Python 3).
		"""
		import re

		# The "|" between the two alternatives must stay unescaped, otherwise
		# it only matches a literal pipe.  The hyphen is escaped on purpose: an
		# unescaped "'-_" is read as a character range.  "\t" is listed so that
		# the tab entry of the punctuation table can actually be reached.
		tokens = re.findall(
			r"[\w']+|[.,!?;\"'\-_`:<>/\\\[\](){}@#$%^&*=+| \n\t]", inp)

		# u''.join() promotes byte strings to unicode on Python 2 and is a plain
		# str join on Python 3, where the ``unicode`` builtin does not exist.
		return u''.join(elfify_token(token) for token in tokens)

	def elfify_token(item):
		"""Convert one token produced by elfify_start().

		* A key of ``punctuation`` yields its table entry.
		* A token made only of digits goes to elfify_number().
		* Anything else is treated as a word: apostrophes are removed and the
		  result is passed to elfify_word().
		"""
		if item in punctuation:
			return punctuation[item]

		if item.isdigit():
			# isdigit() also accepts characters such as superscript digits that
			# int() rejects; those tokens fall through to the word path instead
			# of raising ValueError.
			try:
				number = int(item)
			except ValueError:
				number = None
			if number is not None:
				return elfify_number(number)

		return elfify_word(item.replace("'", ""))


	def elfify_number(num):
		"""Return the placeholder keystrokes used for any number.

		``num`` is currently ignored: every number is rendered as the same five
		backtick keys.
		"""
		# TODO: implement fancy base-12 Elvish numerals
		return '`````'


	def elfify_word(inp):
		"""Transliterate a single word into Tengwar Annatar keystrokes.

		The word is lower-cased and then processed in these steps:

		* '' is returned as is; 'of' and 'the' map to the single keys 'W'
		  and '@';
		* voiced 'th' is marked by replace_th();
		* letters are normalised (c -> s, C or k; g -> j before e, i, y;
		  q -> k; r -> R before a, e, i, o, u, y; y -> Y before a, e, i, o, u);
		* a final 's' that does not follow a, i, o or u is detached, then a
		  final 'e' that does not follow a vowel or y (word still at least
		  three letters long) is detached;
		* what remains is converted by elfify_postfix();
		* an 'i' or ',' key that is not followed by one of the keys in
		  ``vowels`` is swapped for '8' or 'k' respectively (the last key is
		  never swapped);
		* the detached endings are appended again: 'O' for the e, then a
		  final-s key chosen by the key it follows.
		"""
		inp = inp.lower()

		if inp == '':
			return inp
		if inp == 'of':
			return 'W'
		if inp == 'the':
			return '@'

		# Voiced th comes back as 'TH'.
		letters = list(replace_th(inp))
		final = len(letters) - 1

		# One left-to-right pass.  Only the current position is rewritten, so
		# the neighbour that decides each rule is still in its original form.
		for pos, letter in enumerate(letters):
			following = letters[pos + 1] if pos < final else None
			if letter == 'q':
				letters[pos] = 'k'
			elif letter == 'c':
				if following is None:
					letters[pos] = 'k'      # word-final c
				elif following in 'eiy':
					letters[pos] = 's'      # soft c
				elif following == 'h':
					letters[pos] = 'C'      # Ch
				else:
					letters[pos] = 'k'      # hard c
			elif following is None:
				continue
			elif letter == 'g' and following in 'eiy':
				letters[pos] = 'j'          # soft g
			elif letter == 'r' and following in 'aeiouy':
				letters[pos] = 'R'          # pre-vowel r
			elif letter == 'y' and following in 'aeiou':
				letters[pos] = 'Y'          # consonant y
		inp = ''.join(letters)

		# Detach the ending s -- unless it comes right after a, i, o or u.
		has_trailing_s = (len(inp) > 1 and inp[-1] == 's'
				and inp[-2] not in 'aiou')
		if has_trailing_s:
			inp = inp[:-1]

		# Detach the ending e when the letter before it is not a vowel or y.
		has_trailing_e = (len(inp) >= 3 and inp[-1] == 'e'
				and inp[-2] not in 'aeiouy')
		if has_trailing_e:
			inp = inp[:-1]

		# Elfification
		output = elfify_postfix(inp) if len(inp) != 0 else carrier

		# Swap s and z keys that are not followed by a vowel key.
		keys = list(output)
		for pos in range(len(keys) - 1):
			if keys[pos + 1] in vowels:
				continue
			if keys[pos] == 'i':
				keys[pos] = '8'             # fancy S
			elif keys[pos] == ',':
				keys[pos] = 'k'             # fancy Z
		output = ''.join(keys)

		# Re-attach the endings: e first, then s.
		if has_trailing_e:
			output = output + 'O'

		if has_trailing_s:
			previous = output[-1]
			if previous in '7um8k':
				ending = u'\xc5'
			elif previous in 'qwertyo':
				ending = u'\xc6'
			elif previous in 'l9':
				ending = u'\xa5'
			else:
				ending = '_'
			output = output + ending

		return output

	# Single-letter consonants and the keys they produce, paired position by
	# position.  'R' and 'Y' are the markers elfify_word() writes for pre-vowel r
	# and consonant y.  The '-' characters only keep the two strings aligned;
	# they collapse into a single '-' entry, and elfify_postfix() handles
	# non-alphabetic characters before it consults this table.
	_consonant_letters = 'tdnrRhpbfvmwsj--lYkg-z-'
	_consonant_keys = u'125679qwertyisghjlzxn,.'
	consonants = dictzip(_consonant_letters, _consonant_keys)

	# Two-letter combinations that map to a single key.
	doubles = {
		'th': '3',
		'TH': '4',  # voiced
		'sh': 'd',
		'zh': 'f',
		'ch': 'a',
		'Ch': 'a',
		'ph': 'e',
		'kh': 'c',
		'gh': 'v',
		'wh': 'o',
		'ng': 'b',
		'rd': 'u',
		'ld': 'm',
	}

	# For every vowel letter, the four alternative keys it can be written with.
	# Which of the four is used depends on the key the vowel is attached to
	# (see vowels_for_consonants).
	vowel_series = {
		'a': '#EDC',
		'e': '$RFV',
		'i': '%TGB',
		'o': '^YHN',
		'u': '&UJM',
		'y': u'\xd8\xd9\xda\xdb',
	}

	# All vowel keys of the a, e, i, o and u series (the y series is not part
	# of this string).
	vowels = ''.join(vowel_series[letter] for letter in 'aeiou')

	# Index into the strings of vowel_series, looked up by the key the vowel is
	# attached to.  For example index 0 selects '#' for an 'a'.
	vowels_for_consonants = {
		# index 0
		'2': 0, 'w': 0, 's': 0, 'x': 0, '4': 0,
		'r': 0, 'f': 0, 'v': 0, '5': 0, 't': 0,
		'g': 0, 'b': 0, 'j': 0, 'm': 0, 'o': 0,
		# index 1
		'1': 1, 'q': 1, 'd': 1, 'c': 1, '6': 1, 'y': 1,
		# index 2
		'a': 2, 'z': 2, '3': 2, 'e': 2, 'h': 2, 'n': 2,
		'7': 2, 'u': 2, 'i': 2, ',': 2, 'l': 2, '.': 2,
		# index 3
		'`': 3, '~': 3, '9': 3,
	}

	# Keys that carry a vowel when there is no consonant key to attach it to.
	short_carrier = '`'
	long_carrier = '~'
	carrier = short_carrier

	def elfify_postfix(postfix):
		"""Convert a normalised word (or word remainder) into keystrokes.

		The word is read left to right.  At each position, in this order:

		* a non-alphabetic character becomes a '`' key;
		* a key of ``doubles`` that starts here is consumed as a whole;
		* a vowel letter is attached to what follows (see below);
		* a key of ``consonants`` becomes its table entry;
		* 'x' becomes the two keys u'z\\xe6'.

		A vowel is written after the first key of the consonant that follows
		it, using the variant from ``vowel_series`` that
		``vowels_for_consonants`` assigns to that key.  At the end of the word,
		or directly before another vowel, it is written on ``carrier`` instead.

		The conversion is iterative, so the length of the word is not limited
		by the interpreter's recursion depth.

		Returns the keystrokes; an empty input gives ''.

		Raises:
			NotImplementedError: for an alphabetic character that has no
				mapping.  The message is "<character>, <text after it>".
		"""
		# Pass 1: split the word into (is_vowel, text) units.
		units = []
		pos = 0
		while pos < len(postfix):
			char = postfix[pos]
			step = 1
			if not char.isalpha():
				# TODO: Actually add the appropriate character
				units.append((False, '`'))
			else:
				double = next(
					(key for key in doubles if postfix.startswith(key, pos)),
					None)
				if double is not None:
					units.append((False, doubles[double]))
					step = len(double)
				elif char in vowel_series:
					units.append((True, char))
				elif char in consonants:
					# TODO: Maybe add a doubler ('") if the next consonant is
					# the same thing!
					units.append((False, consonants[char]))
				elif char == 'x':
					units.append((False, u'z\xe6'))
				else:
					raise NotImplementedError(
						"%s, %s" % (char, postfix[pos + 1:]))
			pos += step

		# Pass 2: emit the keys, attaching each vowel to its host.
		# TODO: Fancy n-bars and w-bars here.
		pieces = []
		index = 0
		while index < len(units):
			is_vowel, text = units[index]
			if not is_vowel:
				pieces.append(text)
				index += 1
				continue

			follower = units[index + 1] if index + 1 < len(units) else None
			if follower is None or follower[0]:
				# End of the word, or another vowel follows: use a carrier.
				host, remainder = carrier, ''
				index += 1
			else:
				# The vowel goes right after the first key of the consonant;
				# any further keys of that consonant come after the vowel.
				host, remainder = follower[1][0], follower[1][1:]
				index += 2
			variant = vowel_series[text][vowels_for_consonants[host]]
			pieces.append(host + variant + remainder)

		return ''.join(pieces)

	# Public entry point: convert(text) is an alias for elfify_start(text).
	convert = elfify_start