Skip to content

Instantly share code, notes, and snippets.

@lqc
Created November 26, 2009 01:35
Show Gist options
  • Save lqc/243163 to your computer and use it in GitHub Desktop.
Save lqc/243163 to your computer and use it in GitHub Desktop.
Buggy locale sorting in Python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import locale
# Adopt the locale configured in the user's environment so the collation
# checks below run against it instead of the default "C" locale.
locale.setlocale(locale.LC_ALL, '')
print("Python version:", sys.version_info)
# locale.getlocale() is documented not to accept LC_ALL (it raises TypeError
# when the individual categories differ). Calling setlocale() without a
# locale argument only queries and returns the current setting string.
print("LC_ALL =", locale.setlocale(locale.LC_ALL))
print("LC_COLLATE =", locale.getlocale(locale.LC_COLLATE))
# Encoding used further down to turn the test strings into bytes.
encoding = locale.getpreferredencoding()
print("Prefered encoding is:", encoding)
print()
# Reference sequence, already in the order the sorts below are expected to
# reproduce.
# NOTE(review): 't' is absent and 'v' (not a native Polish letter) is
# present -- possibly unintentional; confirm before changing the data.
alphabet = [
    "a", "ą", "b", "c", "ć",
    "d", "e", "ę", "f", "g",
    "h", "i", "j", "k", "l",
    "ł", "m", "n", "ń", "o",
    "ó", "p", "r", "s", "ś",
    "u", "v", "w", "y", "z",
    "ź", "ż"
]
# Convert to unicode in python 2.x, where the literals above are byte strings
if isinstance(alphabet[0], bytes):
    alphabet = [x.decode('utf-8') for x in alphabet]
def run_test(label, sort_filter):
    """Run ``sort_filter`` on the reference alphabet and print the outcome.

    Prints ``label``, then the space-joined result. A ``FAILED`` line is
    printed when the result differs from ``alphabet`` or when an exception
    is raised. A blank line is always printed last.

    label: heading printed before the result.
    sort_filter: callable that receives the ``alphabet`` list and returns
        a sequence of strings.
    """
    print(label)
    try:
        result = sort_filter(alphabet)
        print(' '.join(result))
        if result != alphabet:
            print("FAILED: Bad result.")
    except Exception as e:
        # TypeError is a subclass of Exception and was reported with the
        # same message, so a single handler covers both cases.
        print("FAILED: Exception:", e)
    finally:
        print()
def _text(literal):
    """Return ``literal`` as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(literal, bytes):
        return literal.decode('utf-8')
    return literal


def _encoded(literal):
    """Return ``literal`` as bytes in the locale's preferred encoding."""
    return _text(literal).encode(encoding)


def _check(label, compare):
    """Print ``label`` followed by ``compare()``, or by the exception it raised.

    Keeps a failing probe from aborting the rest of the script, mirroring
    how run_test reports exceptions.
    """
    try:
        print(label, compare())
    except Exception as e:
        print(label, "FAILED: Exception:", e)


# the original
run_test("Sorted alphabet", lambda x: x)

# Most efficient would be using strxfrm as key on the original values
#
# This is broken on Windows (locale: Polish_Poland)
# - on 2.6.4 it should fail with UnicodeDecodeError, instead yields bad results
# - on 3.1.1 yields bad results
#
# On Linux (locale pl_PL.UTF8):
# - works in 3.1.1 and trunk
# - yields the expected Exception in 2.6.4
run_test('Key=strxfrm(unicode)', lambda x: sorted(x, key=locale.strxfrm))

# Second option is to use strcoll as cmp= (sorted() has no cmp= argument in
# py3k, so run_test reports the resulting TypeError there)
#
# Works in 2.6.4 fine everywhere (why doesn't this yield the UnicodeDecodeError ?!)
#
run_test('Cmp=strcoll(unicode)', lambda x: sorted(x, cmp=locale.strcoll))
_check("A is before Z", lambda: locale.strcoll(_text('a'), _text('z')) < 0)
_check("P is after K", lambda: locale.strcoll(_text('p'), _text('k')) > 0)
_check("Ą is before B", lambda: locale.strcoll(_text('ą'), _text('b')) < 0)
print()


# Next guess is to use strxfrm on strings encoded in the native coding
# Works in 2.6.4 - both Linux and Windows
# 3.1.1-Win: wrong answer
# 3.1.1-Linux: throws exception (as it should)
def encoded_key(x):
    """Sort key: strxfrm of ``x`` encoded with the preferred encoding."""
    return locale.strxfrm(x.encode(encoding))


def encoded_cmp(x, y):
    """Comparison function: strcoll of ``x`` and ``y`` encoded with the preferred encoding."""
    return locale.strcoll(x.encode(encoding), y.encode(encoding))


run_test("Key=strxfrm(bytes_using_preferred_encoding)", lambda x: sorted(x, key=encoded_key))
run_test("Cmp=strcoll(bytes_using_preferred_encoding)", lambda x: sorted(x, cmp=encoded_cmp))
# strcoll rejects bytes on Python 3, and a non-ASCII byte-string literal
# cannot be .encode()d directly on Python 2; _encoded and _check keep either
# failure from terminating the script.
_check("A is before Z", lambda: locale.strcoll(_encoded('a'), _encoded('z')) < 0)
_check("P is after K", lambda: locale.strcoll(_encoded('p'), _encoded('k')) > 0)
_check("Ą is before B", lambda: locale.strcoll(_encoded('ą'), _encoded('b')) < 0)
print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment