Skip to content

Instantly share code, notes, and snippets.

@wezm
Created July 29, 2009 07:19
Show Gist options
  • Save wezm/157912 to your computer and use it in GitHub Desktop.
Save wezm/157912 to your computer and use it in GitHub Desktop.
require 'active_support/core_ext/string'
# Helpers for turning arbitrary Unicode strings into clean, ASCII-only URI
# fragments, plus the String mixin that exposes them.
module CleanURIs #:nodoc:
  # Having these methods as module methods instead of in just a simple mixin
  # enables stubbing them more easily. That matters when filesystem access is
  # stubbed in tests (as it should be): Unicode normalization may load its
  # lookup tables from disk on first use and would then fail, so it needs to
  # be stubbed out as well.
  module UTF8Helper #:nodoc:
    extend self

    # Groups of characters and the ASCII text each of them is replaced with.
    # Based on a table by Nicolas Holzheu
    # (http://www.notsostupid.com/blog/2006/07/07/urls-on-rails/#comment-4373)
    #
    # Characters whose compatibility normalization is plain ASCII (the IJ
    # ligatures, L with middle dot, n preceded by apostrophe) are written as
    # \u escapes so that a normalizing editor or copy/paste cannot silently
    # turn them into ordinary letters.
    TRANSLITERATION_GROUPS = {
      'ÀÁÂÃĀĄĂ' => 'A', 'àáâãāąă' => 'a',
      'Ä' => 'Ae', 'ä' => 'ae',
      'Æ' => 'Ae', 'æ' => 'ae',
      'Å' => 'Aa', 'å' => 'aa',
      'ÇĆČĈĊ' => 'C', 'çćčĉċ' => 'c',
      'ĎĐÐ' => 'D', 'ďđð' => 'd',
      'ÈÉÊËĒĘĚĔĖ' => 'E', 'èéêëēęěĕė' => 'e',
      'ƒ' => 'f',
      'ĜĞĠĢ' => 'G', 'ĝğġģ' => 'g',
      'ĤĦ' => 'H', 'ĥħ' => 'h',
      'ÌÍÎÏĪĨĬĮİ' => 'I', 'ìíîïīĩĭįı' => 'i',
      "\u0132" => 'IJ', "\u0133" => 'ij',
      'Ĵ' => 'J', 'ĵ' => 'j',
      'Ķ' => 'K', 'ķĸ' => 'k',
      "ŁĽĹĻ\u013F" => 'L', "łľĺļ\u0140" => 'l',
      'ÑŃŇŅŊ' => 'N', "ñńňņ\u0149ŋ" => 'n',
      'ÒÓÔÕØŌŐŎ' => 'O', 'òóôõøōőŏ' => 'o',
      'Ö' => 'Oe', 'ö' => 'oe',
      'Œ' => 'OE', 'œ' => 'oe',
      'ŔŘŖ' => 'R', 'ŕřŗ' => 'r',
      'ŚŠŞŜȘ' => 'S', 'śšşŝș' => 's',
      'ß' => 'ss',
      'ŤŢŦȚ' => 'T', 'ťţŧț' => 't',
      'ÙÚÛŪŮŰŬŨŲ' => 'U', 'ùúûūůűŭũų' => 'u',
      'Ü' => 'Ue', 'ü' => 'ue',
      'Ŵ' => 'W', 'ŵ' => 'w',
      'ÝŶŸ' => 'Y', 'ýÿŷ' => 'y',
      'ŹŽŻ' => 'Z', 'žżź' => 'z',
      '×' => 'x'
    }.freeze

    # Per-character lookup table derived from TRANSLITERATION_GROUPS.
    # ASCII characters are never accepted as keys: a plain letter ending up in
    # the table would rewrite ordinary text (e.g. every "I" becoming "IJ").
    TRANSLITERATIONS = TRANSLITERATION_GROUPS.each_with_object({}) do |(originals, substitution), table|
      originals.each_char do |char|
        table[char] = substitution unless char.ascii_only?
      end
    end.freeze

    # Matches any single character that has an entry in TRANSLITERATIONS.
    TRANSLITERATION_PATTERN = /[#{Regexp.escape(TRANSLITERATIONS.keys.join)}]/.freeze

    # Whitespace, "+", "-", en dash, em dash and "_" all count as word breaks.
    WORD_BREAK_PATTERN = /[\s+\-\u2013\u2014_]/.freeze

    # Returns a copy of +str+ reduced to ASCII: the string is NFKD-normalized
    # (so "é" becomes "e" plus a combining accent) and every code point above
    # U+007E is then dropped. Note that DEL (U+007F) is dropped as well.
    def ascii_str_from_str(str)
      str.unicode_normalize(:nfkd).codepoints.select { |cp| cp < 127 }.pack('U*')
    end

    # Replaces every character listed in TRANSLITERATIONS by its ASCII
    # substitution in a single pass, then strips whatever non-ASCII is left.
    # The table has to run first: NFKD alone would turn "Ä" into "A", not "Ae".
    def transliterate_str(str)
      ascii_str_from_str(str.gsub(TRANSLITERATION_PATTERN, TRANSLITERATIONS))
    end

    # Builds a URI slug: transliterate, downcase, join words with +separator+,
    # drop everything that is neither alphanumeric nor the separator, and
    # finally collapse separator runs that the stripping may have created.
    def uri_slug_from_str(str, separator)
      joined = separate_str(transliterate_str(str).downcase, separator)
      separate_str(strip_non_alphanum_str(joined, separator), separator)
    end

    # Replaces every word-break character in +str+ by +separator+ and
    # collapses consecutive separators into one.
    def separate_str(str, separator)
      # Block form: the separator is inserted literally, so backslashes in it
      # are not interpreted as back-references.
      joined = str.gsub(WORD_BREAK_PATTERN) { separator }
      # Nothing to collapse, and an empty group cannot be repeated in a regexp.
      return joined if separator.empty?

      # Group the escaped separator so "+" repeats the whole separator rather
      # than only its last character.
      joined.gsub(/(?:#{Regexp.escape(separator)})+/) { separator }
    end

    # Removes every character from +str+ that is not an ASCII letter or digit.
    # Characters contained in +exception+ (if given) are kept as well.
    def strip_non_alphanum_str(str, exception = nil)
      # to_s: Regexp.escape raises TypeError on nil, which is the default.
      str.gsub(/[^a-zA-Z0-9#{Regexp.escape(exception.to_s)}]/, '')
    end
  end

  module CoreExtensions #:nodoc:
    # Mixin for ::String; every method delegates to CleanURIs::UTF8Helper.
    module String
      # Returns a copy of the string that contains only ascii characters
      # KD-Normalizes Unicode characters before stripping non-ascii
      #
      # This only performs “compatibility equivalence” according to Unicode
      # That means it doesn't transform a whole lot of characters that most
      # “normal” people might consider equivalent,
      # e.g. × (MULTIPLICATION SIGN) to x (LATIN SMALL LETTER X) or
      # æ (LATIN SMALL LETTER AE) => ae. To get that behaviour for a number of
      # common signs use #transliterate
      #
      # Expects the string to be in a Unicode encoding (UTF-8 in practice);
      # String#unicode_normalize raises Encoding::CompatibilityError otherwise.
      def to_ascii
        CleanURIs::UTF8Helper.ascii_str_from_str(self)
      end

      # Transliterates common “special” characters into ASCII
      # it relies on #to_ascii for the bulk of the work and augments that with
      # a transliteration table for special cases. This table is not complete
      # (and doesn't aim to be). It is at the moment also utterly ignorant of
      # anything non-western and covers mostly typical European transliterations.
      # Patches welcome.
      #
      # Gotcha: some transliterations contain upper- and lowercase characters
      # (i.e. Ä => Ae) if the desired result is uppercase-only this method should be
      # followed by a call to upcase
      #
      # HACK: This knows nothing about languages, and transliterations not only
      # depend on the source encoding, but also on the source and target languages
      # Implementing “proper” transliteration is pretty much the same as translation
      # Also, being ignorant about languages and locales has the advantage of
      # producing reliable results cross-system, unlike using Iconv
      def transliterate
        CleanURIs::UTF8Helper.transliterate_str(self)
      end

      # Returns copy of string suitable for a clean uri-fragment
      # spaces and other delimiters are replaced by +separator+ (default: -)
      # all non-ascii-alphanumeric characters are removed
      #
      # Leading and trailing separators are not trimmed.
      def to_uri_slug(separator = '-')
        CleanURIs::UTF8Helper.uri_slug_from_str(self, separator)
      end

      # Returns a slug that uses "_" as separator (see #to_uri_slug).
      def to_dom_id
        to_uri_slug('_')
      end

      # Returns a copy of the string with all words joined by +separator+
      def separate(separator)
        CleanURIs::UTF8Helper.separate_str(self, separator)
      end

      # Returns a copy of the string with all non-ascii-alphanumeric characters
      # removed
      # If +exception+ is set, it also doesn't remove that character
      def strip_non_alphanum_except(exception = nil)
        CleanURIs::UTF8Helper.strip_non_alphanum_str(self, exception)
      end
    end
  end
end
# Mixin that builds the (English) possessive form of a string.
module PosessiveForm
  # Returns a copy of the string with a possessive suffix appended: a lone ’
  # when the string already ends in a lowercase "s", and ’s otherwise (this
  # includes the empty string and strings ending in an uppercase "S").
  #
  # Uses core String#end_with? rather than a multibyte proxy, so no extra
  # library is needed to inspect the last character.
  def posessivize
    suffix = end_with?('s') ? '’' : '’s'
    self + suffix
  end
end
# Mix the possessive-form and clean-URI helpers into the core String class.
# The modules are included one after the other, in this order.
String.class_eval do
  include PosessiveForm
  include CleanURIs::CoreExtensions::String
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment