Created
April 12, 2010 19:42
-
-
Save norman/363923 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 059e245874dd38843986a8c0226c4dfad835bfff Mon Sep 17 00:00:00 2001 | |
From: Norman Clarke <norman@njclarke.com> | |
Date: Mon, 12 Apr 2010 12:44:25 -0300 | |
Subject: [PATCH] Improve reliability of Inflector.transliterate. [#4374 state:resolved] | |
--- | |
.../lib/active_support/inflector/transliterate.rb | 61 ++++++++++++-------- | |
activesupport/test/inflector_test_cases.rb | 5 +- | |
activesupport/test/transliterate_test.rb | 50 ++++++++++++++++ | |
3 files changed, 91 insertions(+), 25 deletions(-) | |
create mode 100644 activesupport/test/transliterate_test.rb | |
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb | |
index ca591ab..9c99dcf 100644 | |
--- a/activesupport/lib/active_support/inflector/transliterate.rb | |
+++ b/activesupport/lib/active_support/inflector/transliterate.rb | |
@@ -1,32 +1,47 @@ | |
# encoding: utf-8 | |
-require 'iconv' | |
-require 'kconv' | |
require 'active_support/core_ext/string/multibyte' | |
module ActiveSupport | |
module Inflector | |
extend self | |
- | |
- # Replaces accented characters with their ascii equivalents. | |
- def transliterate(string) | |
- Iconv.iconv('ascii//ignore//translit', 'utf-8', string).to_s | |
- end | |
- if RUBY_VERSION >= '1.9' | |
- undef_method :transliterate | |
- def transliterate(string) | |
- proxy = ActiveSupport::Multibyte.proxy_class.new(string) | |
- proxy.normalize(:kd).gsub(/[^\x00-\x7F]+/, '') | |
- end | |
+ # Unicode codepoint => ASCII approximation codepoint(s) | |
+ ASCII_APPROXIMATIONS = { | |
+ 198 => [65, 69], # Æ => AE | |
+ 208 => 68, # Ð => D | |
+ 216 => 79, # Ø => O | |
+ 222 => [84, 104], # Þ => Th | |
+ 223 => [115, 115], # ß => ss | |
+ 230 => [97, 101], # æ => ae | |
+ 240 => 100, # ð => d | |
+ 248 => 111, # ø => o | |
+ 254 => [116, 104], # þ => th | |
+ 272 => 68, # Đ => D | |
+ 273 => 100, # đ => d | |
+ 294 => 72, # Ħ => H | |
+ 295 => 104, # ħ => h | |
+ 305 => 105, # ı => i | |
+ 306 => [73, 74], # Ĳ => IJ | |
+ 307 => [105, 106], # ĳ => ij | |
+ 312 => 107, # ĸ => k | |
+ 319 => 76, # Ŀ => L | |
+ 320 => 108, # ŀ => l | |
+ 321 => 76, # Ł => L | |
+ 322 => 108, # ł => l | |
+ 329 => 110, # ŉ => n | |
+ 330 => [78, 71], # Ŋ => NG | |
+ 331 => [110, 103], # ŋ => ng | |
+ 338 => [79, 69], # Œ => OE | |
+ 339 => [111, 101], # œ => oe | |
+ 358 => 84, # Ŧ => T | |
+ 359 => 116 # ŧ => t | |
+ } | |
- # The iconv transliteration code doesn't function correctly | |
- # on some platforms, but it's very fast where it does function. | |
- elsif "foo" != (Inflector.transliterate("föö") rescue nil) | |
- undef_method :transliterate | |
- def transliterate(string) | |
- string.mb_chars.normalize(:kd). # Decompose accented characters | |
- gsub(/[^\x00-\x7F]+/, '') # Remove anything non-ASCII entirely (e.g. diacritics). | |
- end | |
+ # Replaces accented characters with their ASCII approximations, or deletes them if none exist. | |
+ def transliterate(string) | |
+ ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char| | |
+ ASCII_APPROXIMATIONS[char] || (char if char < 128) | |
+ end.compact.flatten.pack("U*") | |
end | |
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL. | |
@@ -45,8 +60,6 @@ module ActiveSupport | |
# <%= link_to(@person.name, person_path(@person)) %> | |
# # => <a href="/person/1-donald-e-knuth">Donald E. Knuth</a> | |
def parameterize(string, sep = '-') | |
- # remove malformed utf8 characters | |
- string = string.toutf8 unless string.is_utf8? | |
# replace accented chars with their ascii equivalents | |
parameterized_string = transliterate(string) | |
# Turn unwanted chars into the separator | |
@@ -59,6 +72,6 @@ module ActiveSupport | |
parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '') | |
end | |
parameterized_string.downcase | |
- end | |
+ end | |
end | |
end | |
diff --git a/activesupport/test/inflector_test_cases.rb b/activesupport/test/inflector_test_cases.rb | |
index 29f87ac..59515da 100644 | |
--- a/activesupport/test/inflector_test_cases.rb | |
+++ b/activesupport/test/inflector_test_cases.rb | |
@@ -188,7 +188,10 @@ module InflectorTestCases | |
StringToParameterizedAndNormalized = { | |
"Malmö" => "malmo", | |
"Garçons" => "garcons", | |
- "Ops\331" => "ops" | |
+ "Ops\331" => "opsu", | |
+ "Ærøskøbing" => "aeroskobing", | |
+ "Aßlar" => "asslar", | |
+ "Japanese: 日本語" => "japanese" | |
} | |
UnderscoreToHuman = { | |
diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb | |
new file mode 100644 | |
index 0000000..d689b6b | |
--- /dev/null | |
+++ b/activesupport/test/transliterate_test.rb | |
@@ -0,0 +1,50 @@ | |
+# encoding: utf-8 | |
+require 'abstract_unit' | |
+require 'active_support/inflector/transliterate' | |
+ | |
+class TransliterateTest < Test::Unit::TestCase | |
+ | |
+ APPROXIMATIONS = { | |
+ "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE", | |
+ "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I", | |
+ "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O", | |
+ "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U", | |
+ "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a", | |
+ "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e", | |
+ "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n", | |
+ "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u", | |
+ "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A", | |
+ "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c", | |
+ "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D", | |
+ "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e", | |
+ "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G", | |
+ "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g", | |
+ "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I", | |
+ "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i", | |
+ "IJ"=>"IJ", "ij"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k", | |
+ "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L", | |
+ "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n", | |
+ "Ň"=>"N", "ň"=>"n", "ʼn"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o", | |
+ "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R", | |
+ "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s", | |
+ "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T", | |
+ "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u", | |
+ "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U", | |
+ "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y", | |
+ "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z" | |
+ } | |
+ | |
+ def test_transliterate_should_not_change_ascii_chars | |
+ (0..127).each do |byte| | |
+ char = [byte].pack("U") | |
+ assert_equal char, ActiveSupport::Inflector.transliterate(char) | |
+ end | |
+ end | |
+ | |
+ def test_should_convert_accented_chars_to_approximate_ascii_chars | |
+ APPROXIMATIONS.each do |given, expected| | |
+ assert_equal expected, ActiveSupport::Inflector.transliterate(given) | |
+ end | |
+ end | |
+ | |
+end | |
-- | |
1.7.0.3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment