Wilfred/sort_esperanto.py

## sort_esperanto.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Collation order used by compare_esperanto_strings.  Space is first so
# that 'a b' sorts before 'ab'; '-' is second so that affixes come first;
# then the Esperanto letters interleaved with the rest of the Latin
# alphabet.  The punctuation and the Greek small sigma (U+03C3) at the end
# are listed only defensively.
_ESPERANTO_ALPHABET = (
    u" -abc\u0109defg\u011dh\u0125ij\u0135klmnop"
    u"qrs\u015dtu\u016dvwxyz'().*,\u03c3"
)

# Character -> position lookup, built once instead of scanning a list for
# every character of every comparison.
_ESPERANTO_RANK = {
    char: rank for rank, char in enumerate(_ESPERANTO_ALPHABET)
}


def _normalise_esperanto(text):
    """Return *text* as lower-cased unicode with outer whitespace stripped."""
    # On Python 2 `bytes` is `str`, so this decodes exactly the inputs the
    # original `type(...) == str` check decoded; on Python 3 it decodes
    # real byte strings and leaves text untouched.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    return text.lower().strip()


def _esperanto_rank(char):
    """Return the sort position of a single character."""
    # Characters missing from the alphabet sort after every listed one,
    # ordered by code point among themselves, instead of raising ValueError.
    return _ESPERANTO_RANK.get(char, len(_ESPERANTO_RANK) + ord(char))


def compare_esperanto_strings(x_mixed_case, y_mixed_case):
    """Compare two strings case-insensitively in Esperanto alphabetical order.

    The whole Latin alphabet is permitted, with the Esperanto letters
    placed directly after their base letters.  Leading and trailing
    whitespace is ignored.

    Args:
        x_mixed_case: first string; text, or UTF-8 encoded bytes.
        y_mixed_case: second string; text, or UTF-8 encoded bytes.

    Returns:
        -1 if x sorts before y, 1 if x sorts after y, 0 if they are equal
        after normalisation (cmp-style).

    Raises:
        UnicodeDecodeError: if a byte-string argument is not valid UTF-8.
    """
    x = _normalise_esperanto(x_mixed_case)
    y = _normalise_esperanto(y_mixed_case)

    for x_char, y_char in zip(x, y):
        x_rank = _esperanto_rank(x_char)
        y_rank = _esperanto_rank(y_char)
        if x_rank != y_rank:
            return -1 if x_rank < y_rank else 1

    # One string is a prefix of the other: the longer one comes afterwards.
    return (len(x) > len(y)) - (len(x) < len(y))

if __name__ == '__main__':
    # Script entry point: print the lines of dump.txt in Esperanto order.
    # cmp_to_key adapts the cmp-style comparator to sort's `key` argument,
    # which works on both Python 2.7 and Python 3.
    from functools import cmp_to_key

    # The context manager closes the file even if reading fails.
    with open('dump.txt', 'r') as dump:
        lines = dump.readlines()
    lines.sort(key=cmp_to_key(compare_esperanto_strings))

    for line in lines:
        # readlines() keeps each line's newline and print adds its own,
        # so strip the line to avoid blank lines in the output.
        print(line.strip())
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	# Collation order used by compare_esperanto_strings.  Space is first so
	# that 'a b' sorts before 'ab'; '-' is second so that affixes come first;
	# then the Esperanto letters interleaved with the rest of the Latin
	# alphabet.  The punctuation and the Greek small sigma (U+03C3) at the
	# end are listed only defensively.
	_ESPERANTO_ALPHABET = (
	    u" -abc\u0109defg\u011dh\u0125ij\u0135klmnop"
	    u"qrs\u015dtu\u016dvwxyz'().*,\u03c3"
	)

	# Character -> position lookup, built once instead of scanning a list
	# for every character of every comparison.
	_ESPERANTO_RANK = {
	    char: rank for rank, char in enumerate(_ESPERANTO_ALPHABET)
	}


	def _normalise_esperanto(text):
	    """Return *text* as lower-cased unicode with outer whitespace stripped."""
	    # On Python 2 `bytes` is `str`, so this decodes exactly the inputs
	    # the original `type(...) == str` check decoded; on Python 3 it
	    # decodes real byte strings and leaves text untouched.
	    if isinstance(text, bytes):
	        text = text.decode('utf8')
	    return text.lower().strip()


	def _esperanto_rank(char):
	    """Return the sort position of a single character."""
	    # Characters missing from the alphabet sort after every listed one,
	    # ordered by code point among themselves, instead of raising
	    # ValueError.
	    return _ESPERANTO_RANK.get(char, len(_ESPERANTO_RANK) + ord(char))


	def compare_esperanto_strings(x_mixed_case, y_mixed_case):
	    """Compare two strings case-insensitively in Esperanto alphabetical order.

	    The whole Latin alphabet is permitted, with the Esperanto letters
	    placed directly after their base letters.  Leading and trailing
	    whitespace is ignored.

	    Args:
	        x_mixed_case: first string; text, or UTF-8 encoded bytes.
	        y_mixed_case: second string; text, or UTF-8 encoded bytes.

	    Returns:
	        -1 if x sorts before y, 1 if x sorts after y, 0 if they are
	        equal after normalisation (cmp-style).

	    Raises:
	        UnicodeDecodeError: if a byte-string argument is not valid UTF-8.
	    """
	    x = _normalise_esperanto(x_mixed_case)
	    y = _normalise_esperanto(y_mixed_case)

	    for x_char, y_char in zip(x, y):
	        x_rank = _esperanto_rank(x_char)
	        y_rank = _esperanto_rank(y_char)
	        if x_rank != y_rank:
	            return -1 if x_rank < y_rank else 1

	    # One string is a prefix of the other: the longer one comes afterwards.
	    return (len(x) > len(y)) - (len(x) < len(y))

	if __name__ == '__main__':
	    # Script entry point: print the lines of dump.txt in Esperanto order.
	    # cmp_to_key adapts the cmp-style comparator to sort's `key`
	    # argument, which works on both Python 2.7 and Python 3.
	    from functools import cmp_to_key

	    # The context manager closes the file even if reading fails.
	    with open('dump.txt', 'r') as dump:
	        lines = dump.readlines()
	    lines.sort(key=cmp_to_key(compare_esperanto_strings))

	    for line in lines:
	        # readlines() keeps each line's newline and print adds its own,
	        # so strip the line to avoid blank lines in the output.
	        print(line.strip())