Created
July 29, 2009 07:19
-
-
Save wezm/157912 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'active_support/core_ext/string' | |
# Helpers and String extensions that reduce UTF-8 text to plain ASCII, e.g. to
# build clean URI slugs and DOM ids.
module CleanURIs # :nodoc:
  # The actual work lives in module methods instead of directly in the String
  # mixin so that it can be stubbed easily. That matters when filesystem access
  # is stubbed in tests: Unicode normalization may need to load its lookup
  # tables from disk on first use and would then fail, so it has to be stubbed
  # out as well.
  module UTF8Helper # :nodoc:
    extend self

    # Groups of characters (key) that are all replaced by the same ASCII
    # string (value). Based on a table by Nicolas Holzheu
    # (http://www.notsostupid.com/blog/2006/07/07/urls-on-rails/#comment-4373).
    #
    # Characters that have a compatibility decomposition are written as \u
    # escapes so that tools which normalize the source text cannot silently
    # turn them into plain ASCII letters (which would make the table rewrite
    # ordinary letters).
    TRANSLITERATION_GROUPS = {
      'ÀÁÂÃĀĄĂ' => 'A',
      'àáâãāąă' => 'a',
      'Ä' => 'Ae',
      'ä' => 'ae',
      'Å' => 'Aa',
      'å' => 'aa',
      'Æ' => 'Ae',
      'æ' => 'ae',
      'ÇĆČĈĊ' => 'C',
      'çćčĉċ' => 'c',
      'ĎĐÐ' => 'D',
      'ďđð' => 'd',
      'ÈÉÊËĒĘĚĔĖ' => 'E',
      'èéêëēęěĕė' => 'e',
      'ƒ' => 'f',
      'ĜĞĠĢ' => 'G',
      'ĝğġģ' => 'g',
      'ĤĦ' => 'H',
      'ĥħ' => 'h',
      'ÌÍÎÏĪĨĬĮİ' => 'I',
      'ìíîïīĩĭįı' => 'i',
      "\u0132" => 'IJ', # LATIN CAPITAL LIGATURE IJ
      'Ĵ' => 'J',
      'ĵ' => 'j',
      'Ķ' => 'K',
      'ķĸ' => 'k',
      'ŁĽĹĻĿ' => 'L',
      'łľĺļŀ' => 'l',
      'ÑŃŇŅŊ' => 'N',
      "ñńňņ\u0149ŋ" => 'n', # \u0149: LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
      'ÒÓÔÕØŌŐŎ' => 'O',
      'òóôõøōőŏ' => 'o',
      'Ö' => 'Oe',
      'ö' => 'oe',
      'Œ' => 'OE',
      'œ' => 'oe',
      'ŔŘŖ' => 'R',
      'ŕřŗ' => 'r',
      'ŚŠŞŜȘ' => 'S',
      'śšşŝș' => 's',
      'ß' => 'ss',
      'ŤŢŦȚ' => 'T',
      'ťţŧț' => 't',
      'ÙÚÛŪŮŰŬŨŲ' => 'U',
      'ùúûūůűŭũų' => 'u',
      'Ü' => 'Ue',
      'ü' => 'ue',
      'Ŵ' => 'W',
      'ŵ' => 'w',
      '×' => 'x',
      'ÝŶŸ' => 'Y',
      'ýÿŷ' => 'y',
      'ŹŽŻ' => 'Z',
      'žżź' => 'z'
    }.freeze

    # Single character => ASCII replacement, derived from the groups above.
    TRANSLITERATION_MAP = TRANSLITERATION_GROUPS.each_with_object({}) do |(characters, substitution), map|
      characters.each_char { |character| map[character] = substitution }
    end.freeze

    # Matches any single character that has an entry in TRANSLITERATION_MAP.
    TRANSLITERATION_PATTERN = /[#{Regexp.escape(TRANSLITERATION_MAP.keys.join)}]/

    # Characters treated as word delimiters: ASCII whitespace, plus, hyphen,
    # en dash (U+2013), em dash (U+2014) and underscore.
    DELIMITER_PATTERN = /[\s+\-\u2013\u2014_]/

    # Returns a copy of +str+ that contains only ASCII characters.
    #
    # The string is NFKD-normalized first, so characters with a Unicode
    # decomposition keep their ASCII base letter; every code point of 127 or
    # above (this includes DEL, U+007F) is then dropped.
    #
    # Uses Ruby's built-in String#unicode_normalize, so +str+ has to be in a
    # Unicode encoding (normally UTF-8).
    def ascii_str_from_str(str)
      str.unicode_normalize(:nfkd).codepoints.select { |cp| cp < 127 }.pack('U*')
    end

    # Replaces every character listed in TRANSLITERATION_GROUPS by its ASCII
    # substitute (e.g. "Ä" => "Ae", "ß" => "ss") and reduces whatever is left
    # to ASCII via #ascii_str_from_str.
    def transliterate_str(str)
      # One pass with a hash lookup; the replacements are plain ASCII, so they
      # can never be matched again by the pattern.
      ascii_str_from_str(str.gsub(TRANSLITERATION_PATTERN, TRANSLITERATION_MAP))
    end

    # Builds a lowercase slug from +str+ with words joined by +separator+.
    #
    # Steps: transliterate, downcase, turn delimiters into +separator+, drop
    # everything that is neither ASCII alphanumeric nor part of +separator+,
    # then collapse the separator runs that the dropped characters left behind.
    # Leading and trailing separators are not trimmed.
    def uri_slug_from_str(str, separator)
      joined = separate_str(transliterate_str(str).downcase, separator)
      separate_str(strip_non_alphanum_str(joined, separator), separator)
    end

    # Returns a copy of +str+ in which every delimiter (see DELIMITER_PATTERN)
    # is replaced by +separator+ and consecutive separators are collapsed into
    # a single one.
    def separate_str(str, separator)
      separator = separator.to_s
      # Block form inserts the separator literally; a replacement *string*
      # would interpret backslashes as back-references.
      joined = str.gsub(DELIMITER_PATTERN) { separator }
      # Nothing to collapse for an empty separator (and /+/ would be invalid).
      return joined if separator.empty?

      # Group the escaped separator so that "+" repeats the whole separator,
      # not just its last character.
      joined.gsub(/(?:#{Regexp.escape(separator)})+/) { separator }
    end

    # Returns a copy of +str+ without any character that is not an ASCII
    # letter or digit. Characters contained in +exception+ (may be nil) are
    # kept as well.
    def strip_non_alphanum_str(str, exception = nil)
      # to_s turns the nil default into "", which Regexp.escape accepts.
      str.gsub(/[^a-zA-Z0-9#{Regexp.escape(exception.to_s)}]/, '')
    end
  end

  module CoreExtensions # :nodoc:
    # Mixin for ::String; every method delegates to CleanURIs::UTF8Helper.
    module String
      # Returns a copy of the string that contains only ASCII characters.
      # KD-normalizes Unicode characters before stripping non-ASCII.
      #
      # This only performs "compatibility equivalence" according to Unicode.
      # That means it doesn't transform a whole lot of characters that most
      # "normal" people might consider equivalent,
      # e.g. × (MULTIPLICATION SIGN) to x (LATIN SMALL LETTER X) or
      # æ (LATIN SMALL LETTER AE) to ae. To get that behaviour for a number of
      # common signs use #transliterate.
      #
      # Expects the string to be in a Unicode encoding (normally UTF-8).
      def to_ascii
        CleanURIs::UTF8Helper.ascii_str_from_str(self)
      end

      # Transliterates common "special" characters into ASCII.
      # It relies on #to_ascii for the bulk of the work and augments that with
      # a transliteration table for special cases. This table is not complete
      # (and doesn't aim to be). It is at the moment also utterly ignorant of
      # anything non-western and covers mostly typical European
      # transliterations. Patches welcome.
      #
      # Gotcha: some transliterations contain upper- and lowercase characters
      # (e.g. Ä => Ae). If the desired result is uppercase-only, this method
      # should be followed by a call to upcase.
      #
      # HACK: This knows nothing about languages, and transliterations not only
      # depend on the source encoding, but also on the source and target
      # languages. Implementing "proper" transliteration is pretty much the
      # same as translation. On the other hand, being ignorant about languages
      # and locales has the advantage of producing reliable results
      # cross-system, unlike using Iconv.
      def transliterate
        CleanURIs::UTF8Helper.transliterate_str(self)
      end

      # Returns a copy of the string suitable for a clean URI fragment:
      # spaces and other delimiters are replaced by +separator+ (default: -),
      # the result is lowercased and all characters that are not ASCII
      # alphanumeric are removed.
      def to_uri_slug(separator='-')
        CleanURIs::UTF8Helper.uri_slug_from_str(self, separator)
      end

      # Same as #to_uri_slug, but with "_" as separator.
      def to_dom_id
        to_uri_slug('_')
      end

      # Returns a copy of the string with all words joined by +separator+.
      def separate(separator)
        CleanURIs::UTF8Helper.separate_str(self, separator)
      end

      # Returns a copy of the string with all characters removed that are not
      # ASCII letters or digits.
      # If +exception+ is set, the characters it contains are kept as well.
      def strip_non_alphanum_except(exception=nil)
        CleanURIs::UTF8Helper.strip_non_alphanum_str(self, exception)
      end
    end
  end
end
# Mixin for strings that builds an English possessive form.
module PosessiveForm
  # Returns a copy of the string with a possessive suffix appended:
  # a bare right single quotation mark (U+2019) if the string ends in a
  # lowercase "s", otherwise the quotation mark followed by "s".
  # Only a lowercase "s" is recognised; an empty string gets the "’s" suffix.
  def posessivize
    suffix = end_with?('s') ? "\u2019" : "\u2019s"
    self + suffix
  end
end
# Reopens the core String class so that every string gains #posessivize (from
# PosessiveForm) and the methods defined in CleanURIs::CoreExtensions::String
# (#to_ascii, #transliterate, #to_uri_slug, #to_dom_id, #separate and
# #strip_non_alphanum_except).
class String
  include PosessiveForm
  include CleanURIs::CoreExtensions::String
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment