Skip to content

Instantly share code, notes, and snippets.

@apeiros
Created February 18, 2011 19:13
Show Gist options
  • Save apeiros/834225 to your computer and use it in GitHub Desktop.
Save apeiros/834225 to your computer and use it in GitHub Desktop.
Unicode String operations, case mapping, natural sorting
# Encoding: utf-8
# The following code is under BSD 2-clause license
# -> http://en.wikipedia.org/wiki/BSD_licenses#2-clause_license_.28.22Simplified_BSD_License.22_or_.22FreeBSD_License.22.29
#
# Author: Stefan Rusterholz <stefan.rusterholz@gmail.com> - https://github.com/apeiros/
#
# A module to help with transliteration issues.
# Provides methods for:
# * Changing the case of characters/strings, mapping most latin characters
# * Generate a value to perform natural sorting of strings (e.g. "10" > "2",
# "ä" < "z" etc.)
# * Transliterate a utf-8 string to 7bit chars only (e.g. map "ä" to "a" or "ae")
#
# All methods output valid utf-8 strings
#
# Beware, this module does not perform proper unicode collation based operations
module Transliterate
BiggestSortableNumber = ((128**128)-1)
SingleDigitToNaturalKey = %W[\u000a \u000b \u000c \u000d \u000e \u000f \u0010 \u0011 \u0012 \u0013 \u0014]
module_function
UpperToLower = Hash[%W[
A a
\u00c5 \u00e5
\u00c4 \u00e4
\u00c6 \u00e6
\u00c2 \u00e2
\u00c1 \u00e1
\u00c0 \u00e0
\u00c3 \u00e3
B b
C c
\u00c7 \u00e7
D d
E e
\u00cb \u00eb
\u00c9 \u00e9
\u00ca \u00ea
\u00c8 \u00e8
F f
G g
H h
I i
\u00cf \u00ef
\u00cd \u00ed
\u00ce \u00ee
\u00cc \u00ec
J j
K k
L l
M m
N n
\u00d1 \u00f1
O o
\u00d6 \u00f6
\u00d3 \u00f3
\u00d4 \u00f4
\u00d2 \u00f2
\u00d5 \u00f5
\u00d8 \u00f8
P p
Q q
R r
S s
\u1e9e \u00df
T t
U u
\u00dc \u00fc
\u00da \u00fa
\u00db \u00fb
\u00d9 \u00f9
V v
W w
X x
Y y
\u0178 \u00ff
Z z
].each_slice(2).to_a]
LowerToUpper = UpperToLower.invert
SwapCase = UpperToLower.merge(LowerToUpper)
UpperCase = UpperToLower.keys.join('')
LowerCase = LowerToUpper.keys.join('')
MatchUpper = /[#{Regexp.escape(UpperCase)}]/u
MatchLower = /[#{Regexp.escape(LowerCase)}]/u
MatchChar = /[#{Regexp.escape(SwapCase.keys.join(''))}]/u
MatchNoChar = /[^#{Regexp.escape(SwapCase.keys.join(''))}]/u
mixed_case = UpperToLower.dup
LowerToUpper.each do |key, value|
mixed_case[value] += key unless UpperToLower[value] == key
end
mixed_case = mixed_case.map { |upper,lower| upper+lower }
CaseSensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf#{UpperCase}#{LowerCase}"
CaseInsensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf".chars.to_a+mixed_case
Alphabet = (32...([CaseSensitiveOrder.length+32, (1<<15)+22528].max)).to_a.pack("U*")
CaseSensitiveIndex = Hash[CaseSensitiveOrder.chars.with_index.to_a.map { |char, index| [char, Alphabet[index]] }]
CaseInsensitiveIndex = {}
CaseInsensitiveOrder.each_with_index do |equivalents, index|
value = Alphabet[index]
equivalents.each_char { |char| CaseInsensitiveIndex[char] = value }
end
NaturalSortUnknown = /[\x00-\x19]|[^#{Regexp.escape(CaseSensitiveOrder)}]/u
NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u
Transliterate = Hash[%W[
\u00c5 A A
\u00c4 A Ae
\u00c6 A AE
\u00c2 A A
\u00c0 A A
\u00c3 A A
\u00c7 C C
\u00cb E E
\u00c9 E E
\u00ca E E
\u00c8 E E
\u00cf I I
\u00ce I I
\u00cc I I
\u00d1 N N
\u00d6 O Oe
\u00d4 O O
\u00d2 O O
\u00d5 O O
\u00d8 O O
\u1e9e S SS
\u00dc U U
\u00db U U
\u00d9 U U
\u0178 Y Y
\u00e5 a a
\u00e4 a ae
\u00e6 a ae
\u00e1 a a
\u00e2 a a
\u00e0 a a
\u00e3 a a
\u00e7 c c
\u00eb e e
\u00ea e e
\u00e8 e e
\u00ef i i
\u00ed i i
\u00ee i i
\u00ec i i
\u00f1 n n
\u00f6 o oe
\u00f3 o o
\u00f4 o o
\u00f2 o o
\u00f5 o o
\u00f8 o o
\u00df s ss
\u00fc u u
\u00fa u u
\u00fb u u
\u00f9 u u
\u00ff y y
].each_slice(3).map { |char, map1, map2| [char, [map1, map2]] }]
# 7bit ascii characters
("A".."Z").each do |char| Transliterate[char] = [char, char] end
("a".."z").each do |char| Transliterate[char] = [char, char] end
("0".."9").each do |char| Transliterate[char] = [char, char] end
%W[! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~].each do |char| Transliterate[char] = [char, char] end
Transliterate[' '] = [' ', ' ']
Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }]
Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }]
TransliterateMatch = /[#{Regexp.escape(Transliterate.keys.join(''))}]/u
# Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped
# to "a"
#
# Example:
# Transliterate.transliterate "Größe" # => "Grose"
#
# Also see transliterate2, which tries to retain the meaning
def transliterate(string, replace_unknown="", extend_transliteration=nil)
if extend_transliteration then
match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u)
replace = Transliterate1.merge(extend_transliteration)
else
match = TransliterateMatch
replace = Transliterate1
end
string.encode(Encoding::UTF_8).gsub(match, replace)
end
# Maps non-7bit characters to a sequence of 7 bit characters which convey
# the same meaning, e.g. "ä" is mapped to "ae".
#
# Example:
# Transliterate.transliterate2 "Größe" # => "Groesse"
#
# Also see transliterate, which maps single characters to single characters
def transliterate2(string, replace_unknown="", extend_transliteration=nil)
if extend_transliteration then
match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u)
replace = Transliterate2.merge(extend_transliteration)
else
match = TransliterateMatch
replace = Transliterate2
end
string.encode(Encoding::UTF_8).gsub(match, replace)
end
def downcase(string)
string.encode(Encoding::UTF_8).gsub(MatchUpper, UpperToLower)
end
def upcase(string)
string.encode(Encoding::UTF_8).gsub(MatchLower, LowerToUpper)
end
def swapcase(string)
string.encode(Encoding::UTF_8).gsub(MatchChar, SwapCase)
end
# stable makes that differently cased letters don't
def case_insensitive_natural_sort_key(string, stable=true)
base = string.encode(Encoding::UTF_8).
tr("\t\n\r\v\f",' '). # convert whitespace to space
strip. # remove leading & trailing whitespace
squeeze(' '). # remove duplicate whitespace
gsub(NaturalSortUnknown, '') # remove unknown characters
part1 = base.
gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters
if stable then
part2 = case_sensitive_natural_sort_key(base.gsub(MatchNoChar, ''))
part1 + part2
else
part1
end
end
def case_sensitive_natural_sort_key(string)
string.encode(Encoding::UTF_8).
tr("\t\n\r\v\f",' '). # convert whitespace to space
strip. # remove leading & trailing whitespace
squeeze(' '). # remove duplicate whitespace
gsub(NaturalSortUnknown, ''). # remove unknown characters
gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe)
gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters
end
# performs a binary encoding whose result can be compared on a binary level correctly.
# To do that it uses a runtime length encoding of the number plus a base128 encoding.
# It works with numbers up to 128 digits long (in base 128), that means the biggest
# number correctly processed is 270 digits long in decimal:
# 5282945311356652463523397849165166065188473260361215221279607090266739025567248594…
# 7441725588765718789467439499325712867888234755950268553725053897846293957690838668399…
# 9005084168731517676426441053024232908211188404148028292751561738838396898767036476489…
# 538580897737998335 - a pretty friggin huge number. Larger numbers are truncated to
# that.
# The scheme used will never use more bytes than the ascii representation of the number
# used. E.g. 0-9 will still use 1 byte only. Larger numbers even use less space, e.g.
# 999_999_999_999 (12 bytes in decimal in ascii) can be represented by 8 bytes.
# Theoretically the implementation could be improved to support numbers as big as
# 128^(128^8).
def digits_to_natural_key(value)
# * 1-8 are for runtime length encoded negative numbers
# * 9 is for 1 digit (in base 128) negative numbers
# * 10-29 for single digits
# * 21 for 1 digit (in base 128) positive numbers
# * 22-29 for runtime length encoded positive numbers
if value.between?(0,9) then
SingleDigitToNaturalKey[value]
else
sign = value < 0
value = value.abs
value = BiggestSortableNumber if value > BiggestSortableNumber
converted = base128(value)
if converted.size == 1 then
converted << (sign ? 10 : 21)
else
converted << converted.size
converted << (sign ? 9 : 22) # simplified version
# converted << (sign ? 10-converted.size : 21+converted.size) # this version would enable very large numbers
end
converted.reverse.pack("U*").encode(Encoding::UTF_8) # must be big-endian
end
end
def base128(value)
base128 = []
begin
value, digit = value.divmod(128)
base128 << digit
end until value.zero?
base128
end
end
if __FILE__ == $0 then
include Transliterate
require 'test/unit'
class TestNatcmp < Test::Unit::TestCase
def test_natsortkey
assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg Hello329.jpg hello292349282912405965338291184.jpg), %w(hello292349282912405965338291184.jpg hello3.jpg Hello329.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) })
assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) })
assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) })
assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a })
assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?)
assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?)
assert_equal(%w(Ä ä), %w(ä Ä).sort_by { |a| case_insensitive_natural_sort_key(a, true) })
assert_equal(case_insensitive_natural_sort_key("Ä", false), case_insensitive_natural_sort_key("ä", false))
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment