lqc/python_locale_sort_bug.py

## python_locale_sort_bug.py
# -*- coding: utf-8 -*-

from __future__ import print_function

import sys

import locale
# Adopt the user's default locale settings (the empty string asks for them)
# so the collation functions used below follow the system's rules.
locale.setlocale(locale.LC_ALL, '')
print("Python version:", sys.version_info)
# locale.getlocale() is documented as not supporting LC_ALL: it raises
# TypeError when the individual categories are set to different locales.
# Fall back to the raw setting string returned by the query form of
# setlocale() so the diagnostics header cannot abort the whole script.
try:
    lc_all = locale.getlocale(locale.LC_ALL)
except TypeError:
    lc_all = locale.setlocale(locale.LC_ALL)
print("LC_ALL =", lc_all)
print("LC_COLLATE =", locale.getlocale(locale.LC_COLLATE))

# Encoding used further down to turn text into bytes for the byte-string
# variants of the tests.
encoding = locale.getpreferredencoding()
print("Prefered encoding is:", encoding)
print()

# Reference sequence: lowercase letters in the order the locale-aware sorts
# are expected to produce.  Every test result is compared against this list.
# NOTE(review): "t" is absent and "v" is present - confirm this is intended.
alphabet = [
    "a", "ą", "b", "c", "ć", "d", "e", "ę",
    "f", "g", "h", "i", "j", "k", "l", "ł",
    "m", "n", "ń", "o", "ó", "p", "r", "s",
    "ś", "u", "v", "w", "y", "z", "ź", "ż",
]

# Under Python 2.x the plain literals above are byte strings; decode them as
# UTF-8 so the tests operate on unicode objects.  Under Python 3 the check is
# false and the list is left untouched.
if isinstance(alphabet[0], bytes):
    alphabet = list(map(lambda raw: raw.decode('utf-8'), alphabet))

def run_test(label, sort_filter):
    """Apply one sorting strategy to the reference alphabet and print the outcome.

    label       -- heading printed before the result.
    sort_filter -- callable that receives the module-level ``alphabet`` list
                   and returns the sequence to check.

    The returned items are printed separated by spaces, followed by
    "FAILED: Bad result." when they differ from ``alphabet``.  An exception
    raised while sorting, printing or comparing is reported as
    "FAILED: Exception: ..." instead of propagating.  A blank line always
    closes the report.
    """
    print(label)
    try:
        ordered = sort_filter(alphabet)
        print(' '.join(ordered))
        if ordered != alphabet:
            print("FAILED: Bad result.")
    except Exception as error:
        # Covers TypeError as well (e.g. an unsupported sorted() argument);
        # every failure is reported with the same message format.
        print("FAILED: Exception:", error)
    finally:
        print()


# Baseline: hand the reference list back unchanged.  This prints the expected
# order; the comparison inside run_test is against the very same list.
run_test("Sorted alphabet", lambda x: x)


# The most efficient approach would be to use strxfrm as the sort key on the
# original (unicode) values.
#
# This is broken on Windows (locale: Polish_Poland):
#  - on 2.6.4 it should fail with UnicodeDecodeError, but instead yields bad results
#  - on 3.1.1 it yields bad results
#
# On Linux (locale pl_PL.UTF8):
#  - works in 3.1.1 and trunk
#  - yields the expected exception in 2.6.4
run_test('Key=strxfrm(unicode)', lambda x: sorted(x, key=locale.strxfrm))

# The second option is to use strcoll as the comparison function (you can't in
# py3k: sorted() no longer accepts the cmp argument there).
#
# Works fine everywhere in 2.6.4 (why doesn't this yield the UnicodeDecodeError?!)
#
run_test('Cmp=strcoll(unicode)', lambda x: sorted(x, cmp=locale.strcoll))
# Direct spot checks of single comparisons.  They run outside run_test, so an
# exception raised here is not caught and reported the way test failures are.
# NOTE(review): without unicode_literals these literals are byte strings under
# Python 2, unlike the decoded alphabet entries - confirm that is intended.
print("A is before Z", (locale.strcoll('a', 'z') < 0) )
print("P is after K", (locale.strcoll('p', 'k') > 0) )
print("Ą is before B", (locale.strcoll('ą', 'b') < 0) )
print()

# The next guess is to use strxfrm on strings encoded in the native
# (preferred) encoding.
# Works in 2.6.4 - both Linux and Windows
# 3.1.1-Win: wrong answer
# 3.1.1-Linux: throws exception (as it should)
encoded_key = lambda x: locale.strxfrm(x.encode(encoding))
run_test("Key=strxfrm(bytes_using_preferred_encoding)", lambda x: sorted(x, key=encoded_key))

# Same idea for strcoll: encode both operands before comparing them.
encoded_cmp = lambda x, y: locale.strcoll(x.encode(encoding), y.encode(encoding))
run_test("Cmp=strcoll(bytes_using_preferred_encoding)", lambda x: sorted(x, cmp=encoded_cmp))
# NOTE(review): these spot checks pass encoded bytes to strcoll outside
# run_test; if strcoll rejects bytes (as the Python 3 notes above suggest for
# strxfrm) the script would stop here with an uncaught exception - confirm.
print("A is before Z", (locale.strcoll('a'.encode(encoding), 'z'.encode(encoding)) < 0) )
print("P is after K", (locale.strcoll('p'.encode(encoding), 'k'.encode(encoding)) > 0) )
print("Ą is before B", (locale.strcoll('ą'.encode(encoding), 'b'.encode(encoding)) < 0) )
print()
	# -*- coding: utf-8 -*-

	from __future__ import print_function

	import sys

	import locale
	# Adopt the user's default locale settings (the empty string asks for them)
	# so the collation functions used below follow the system's rules.
	locale.setlocale(locale.LC_ALL, '')
	print("Python version:", sys.version_info)
	# locale.getlocale() is documented as not supporting LC_ALL: it raises
	# TypeError when the individual categories are set to different locales.
	# Fall back to the raw setting string returned by the query form of
	# setlocale() so the diagnostics header cannot abort the whole script.
	try:
		lc_all = locale.getlocale(locale.LC_ALL)
	except TypeError:
		lc_all = locale.setlocale(locale.LC_ALL)
	print("LC_ALL =", lc_all)
	print("LC_COLLATE =", locale.getlocale(locale.LC_COLLATE))

	# Encoding used further down to turn text into bytes for the byte-string
	# variants of the tests.
	encoding = locale.getpreferredencoding()
	print("Prefered encoding is:", encoding)
	print()

	# Reference sequence: lowercase letters in the order the locale-aware sorts
	# are expected to produce.  Every test result is compared against this list.
	# NOTE(review): "t" is absent and "v" is present - confirm this is intended.
	alphabet = [
		"a", "ą", "b", "c", "ć",
		"d", "e", "ę", "f", "g",
		"h", "i", "j", "k", "l",
		"ł", "m", "n", "ń", "o",
		"ó", "p", "r", "s", "ś",
		"u", "v", "w", "y", "z",
		"ź", "ż"
	]

	# Convert to unicode in python 2.x, where the plain literals above are byte
	# strings.  The body of the ``if`` must be indented one level deeper than
	# the ``if`` itself; that nesting had been lost here.
	if isinstance(alphabet[0], bytes):
		alphabet = [ x.decode('utf-8') for x in alphabet ]

	def run_test(label, sort_filter):
		"""Apply one sorting strategy to the reference alphabet and print the outcome.

		label       -- heading printed before the result.
		sort_filter -- callable that receives the ``alphabet`` list and
		               returns the sequence to check.

		The returned items are printed separated by spaces, followed by
		"FAILED: Bad result." when they differ from ``alphabet``.  An exception
		raised while sorting, printing or comparing is reported as
		"FAILED: Exception: ..." instead of propagating.  A blank line always
		closes the report.
		"""
		print(label)
		try:
			result = sort_filter(alphabet)
			print(' '.join(result))
			if result != alphabet:
				print("FAILED: Bad result.")
		except Exception as e:
			# Covers TypeError as well (e.g. an unsupported sorted() argument);
			# every failure is reported with the same message format.
			print("FAILED: Exception:", e)
		finally:
			print()



	# Baseline: hand the reference list back unchanged.  This prints the expected
	# order; the comparison inside run_test is against the very same list.
	run_test("Sorted alphabet", lambda x: x)


	# The most efficient approach would be to use strxfrm as the sort key on the
	# original (unicode) values.
	#
	# This is broken on Windows (locale: Polish_Poland):
	#  - on 2.6.4 it should fail with UnicodeDecodeError, but instead yields bad results
	#  - on 3.1.1 it yields bad results
	#
	# On Linux (locale pl_PL.UTF8):
	#  - works in 3.1.1 and trunk
	#  - yields the expected exception in 2.6.4
	run_test('Key=strxfrm(unicode)', lambda x: sorted(x, key=locale.strxfrm))

	# The second option is to use strcoll as the comparison function (you can't in
	# py3k: sorted() no longer accepts the cmp argument there).
	#
	# Works fine everywhere in 2.6.4 (why doesn't this yield the UnicodeDecodeError?!)
	#
	run_test('Cmp=strcoll(unicode)', lambda x: sorted(x, cmp=locale.strcoll))
	# Direct spot checks of single comparisons.  They run outside run_test, so an
	# exception raised here is not caught and reported the way test failures are.
	# NOTE(review): without unicode_literals these literals are byte strings under
	# Python 2, unlike the decoded alphabet entries - confirm that is intended.
	print("A is before Z", (locale.strcoll('a', 'z') < 0) )
	print("P is after K", (locale.strcoll('p', 'k') > 0) )
	print("Ą is before B", (locale.strcoll('ą', 'b') < 0) )
	print()

	# The next guess is to use strxfrm on strings encoded in the native
	# (preferred) encoding.
	# Works in 2.6.4 - both Linux and Windows
	# 3.1.1-Win: wrong answer
	# 3.1.1-Linux: throws exception (as it should)
	encoded_key = lambda x: locale.strxfrm(x.encode(encoding))
	run_test("Key=strxfrm(bytes_using_preferred_encoding)", lambda x: sorted(x, key=encoded_key))

	# Same idea for strcoll: encode both operands before comparing them.
	encoded_cmp = lambda x, y: locale.strcoll(x.encode(encoding), y.encode(encoding))
	run_test("Cmp=strcoll(bytes_using_preferred_encoding)", lambda x: sorted(x, cmp=encoded_cmp))
	# NOTE(review): these spot checks pass encoded bytes to strcoll outside
	# run_test; if strcoll rejects bytes (as the Python 3 notes above suggest for
	# strxfrm) the script would stop here with an uncaught exception - confirm.
	print("A is before Z", (locale.strcoll('a'.encode(encoding), 'z'.encode(encoding)) < 0) )
	print("P is after K", (locale.strcoll('p'.encode(encoding), 'k'.encode(encoding)) > 0) )
	print("Ą is before B", (locale.strcoll('ą'.encode(encoding), 'b'.encode(encoding)) < 0) )
	print()