wezm/gist:157912

## gistfile1.rb
require 'active_support/core_ext/string'

module CleanURIs#:nodoc:

  # Stateless helpers behind the String extensions defined in
  # CleanURIs::CoreExtensions::String.
  #
  # Having these methods as module methods instead of in just a simple mixin
  # makes them easier to stub. That matters when file system access is stubbed
  # in tests, because Unicode normalization may load its lookup tables from
  # disk and would then fail, so it needs to be stubbed out as well.
  module UTF8Helper#:nodoc:
    extend self

    # Grouped transliterations: every character of a key is replaced by the
    # value. Based on a table by Nicolas Holzheu
    # (http://www.notsostupid.com/blog/2006/07/07/urls-on-rails/#comment-4373)
    TRANSLIT_GROUPS = {
      'ÀÁÂÃĀĄĂ' => 'A',        'ìíîïīĩĭįı' =>'i',         'ŕřŗ' =>'r',
      'Ä' => 'Ae',             'Ĳ' => 'IJ',               'ŚŠŞŜȘ' => 'S',
      'àáâãāąă' => 'a',        'Ĵ' => 'J',                'śšşŝș' => 's',
      'ä' => 'ae',             'ĵ' => 'j',                'ŤŢŦȚ' => 'T',
      'Æ' => 'Ae',             'Ķ' => 'K',                'ťţŧț' => 't',
      'æ' => 'ae',             'ķĸ' => 'k',               'ÙÚÛŪŮŰŬŨŲ' =>'U',
      'ÇĆČĈĊ' => 'C',          'ŁĽĹĻĿ' => 'L',            'Ü' => 'Ue',
      'çćčĉċ' => 'c',          'łľĺļŀ' => 'l',            'ùúûūůűŭũų' =>'u',
      'ĎĐÐ' => 'D',            'ÑŃŇŅŊ' => 'N',            'ü' => 'ue',
      'ďđð' => 'd',            'ñńňņŉŋ' => 'n',           'Ŵ' => 'W',
      'ÈÉÊËĒĘĚĔĖ' =>'E',       'ÒÓÔÕØŌŐŎ' => 'O',         'ŵ' => 'w',
      'èéêëēęěĕė' =>'e',       'Ö' => 'Oe',               'ÝŶŸ' =>'Y',
      'ƒ' => 'f',              'òóôõøōőŏ' => 'o',         'ýÿŷ' =>'y',
      'ĜĞĠĢ' => 'G',           'ö' => 'oe',               'ŹŽŻ' =>'Z',
      'ĝğġģ' => 'g',           'Œ' => 'OE',               'žżź' =>'z',
      'ĤĦ' => 'H',             'œ' => 'oe',               'Å' => 'Aa',
      'ĥħ' => 'h',             'ŔŘŖ' =>'R',               'å' => 'aa',
      'ÌÍÎÏĪĨĬĮİ' =>'I',       'ß' => 'ss',               '×' => 'x',
    }.freeze

    # Per-character view of TRANSLIT_GROUPS, built once, so that a string can
    # be transliterated in a single pass instead of one gsub per group.
    TRANSLIT_MAP = TRANSLIT_GROUPS.each_with_object({}) { |(originals, substitution), map|
      originals.each_char { |char| map[char] = substitution }
    }.freeze

    # Returns an ASCII-only copy of +str+.
    #
    # The string is NFKD-normalized first, so accented letters decompose into
    # a base letter plus combining marks; every code point that is not below
    # 127 is then dropped (this also drops DEL, code point 127).
    #
    # +str+ must be in a Unicode encoding (in practice UTF-8);
    # String#unicode_normalize raises Encoding::CompatibilityError otherwise.
    def ascii_str_from_str(str)
      str.unicode_normalize(:nfkd).unpack('U*').select { |cp| cp < 127 }.pack('U*')
    end

    # Replaces the characters listed in TRANSLIT_GROUPS by their ASCII
    # substitutions and reduces whatever is left to ASCII via String#to_ascii.
    def transliterate_str(str)
      # Compose first so that decomposed input ("a" + U+0308) hits the table
      # exactly like its precomposed form ("ä").
      composed = str.unicode_normalize(:nfc)
      # Every key of the table is non-ASCII, so only those characters need a lookup.
      substituted = composed.gsub(/[^[:ascii:]]/) { |char| TRANSLIT_MAP.fetch(char, char) }
      substituted.to_ascii
    end

    # Builds a slug from +str+: transliterate, downcase, join words with
    # +separator+, drop everything that is neither a-z, 0-9 nor part of
    # +separator+, and finally collapse the separator runs that the removal
    # may have left behind. Leading/trailing separators are not trimmed.
    def uri_slug_from_str(str, separator)
      str.transliterate
         .downcase
         .separate(separator)
         .strip_non_alphanum_except(separator)
         .separate(separator)
    end

    # Replaces whitespace, "+", "-", en/em dashes and "_" in +str+ by
    # +separator+ and collapses consecutive separators into a single one.
    def separate_str(str, separator)
      # Block form: the separator is inserted literally, without gsub's
      # backslash processing of replacement strings.
      joined = str.gsub(/[\s\+\-–—\_]/) { separator }
      # Nothing to collapse; also avoids building a pattern from an empty string.
      return joined if separator.empty?

      # The group makes "+" repeat the whole separator, not only its last
      # character, so multi-character separators collapse as well.
      joined.gsub(/(?:#{Regexp.escape(separator)})+/) { separator }
    end

  end

  module CoreExtensions#:nodoc:
    # Mixin that exposes the UTF8Helper functions as String instance methods.
    module String

      # Returns a copy of the string that contains only ascii characters
      # KD-Normalizes Unicode characters before stripping non-ascii
      #
      # This only performs “compatibility equivalence” according to Unicode
      # That means it doesn't transform a whole lot of characters that most
      # “normal” people might consider equivalent,
      # e.g. × (MULTIPLICATION SIGN) to x (LATIN SMALL LETTER X) or
      # æ (LATIN SMALL LETTER AE) => ae. To get that behaviour for a number of
      # common signs use #transliterate
      #
      # Assumes the string is in UTF-8 encoding.
      def to_ascii
        CleanURIs::UTF8Helper.ascii_str_from_str(self)
      end

      # Transliterates common “special” characters into ASCII
      # it relies on #to_ascii for the bulk of the work and augments that with
      # a transliteration table for special cases. This table is not complete
      # (and doesn't aim to be). It is at the moment also utterly ignorant of
      # anything non-western and covers mostly typical European transliterations.
      # Patches welcome.
      #
      # Gotcha: some transliterations contain upper- and lowercase characters
      # (i.e. Ä => Ae) if the desired result is uppercase-only this method should be
      # followed by a call to upcase
      #
      # HACK: This knows nothing about languages, and transliterations not only
      # depend on the source encoding, but also on the source and target languages
      # Implementing “proper” transliteration is pretty much the same as translation
      # Also, being ignorant about languages and locales has the advantage of
      # producing reliable results cross-system, unlike using Iconv
      def transliterate
        CleanURIs::UTF8Helper.transliterate_str(self)
      end

      # Returns copy of string suitable for a clean uri-fragment
      # spaces and other delimiters are replaced by +separator+ (default: -)
      # all non-ascii-alphanumeric characters are removed
      def to_uri_slug(separator='-')
        CleanURIs::UTF8Helper.uri_slug_from_str(self, separator)
      end

      # Same as #to_uri_slug, but with "_" as the separator.
      def to_dom_id
        to_uri_slug('_')
      end

      # Returns a copy of the string with all words joined by +separator+
      def separate(separator)
        CleanURIs::UTF8Helper.separate_str(self, separator)
      end

      # Returns a copy of the string with every character removed that is not
      # a lowercase ASCII letter (a-z) or a digit (0-9). Uppercase letters are
      # removed too, so downcase first if they should survive.
      # If +exception+ is set, its characters are kept as well.
      def strip_non_alphanum_except(exception=nil)
        # to_s: Regexp.escape(nil) raises TypeError, which made the default
        # argument unusable.
        gsub(/[^a-z0-9#{Regexp.escape(exception.to_s)}]/, '')
      end

    end
  end
end

# Mixin for strings that builds an English possessive form.
module PosessiveForm
  # Returns a copy of the string with a possessive suffix appended:
  # a bare ’ when the string ends in a lowercase "s", ’s otherwise
  # (this includes the empty string and strings ending in an uppercase "S").
  def posessivize
    # String#end_with? compares whole characters, so no multibyte wrapper is needed.
    suffix = end_with?('s') ? '’' : '’s'
    self + suffix
  end
end

# Install both mixins on every String. The order only affects method lookup:
# the module included last (CleanURIs::CoreExtensions::String) is searched
# before PosessiveForm.
[PosessiveForm, CleanURIs::CoreExtensions::String].each do |mixin|
  String.send(:include, mixin)
end
	require 'active_support/core_ext/string'

	module CleanURIs#:nodoc:

	  # Stateless helpers behind the String extensions defined in
	  # CleanURIs::CoreExtensions::String.
	  #
	  # Having these methods as module methods instead of in just a simple mixin
	  # makes them easier to stub. That matters when file system access is stubbed
	  # in tests, because Unicode normalization may load its lookup tables from
	  # disk and would then fail, so it needs to be stubbed out as well.
	  module UTF8Helper#:nodoc:
	    extend self

	    # Grouped transliterations: every character of a key is replaced by the
	    # value. Based on a table by Nicolas Holzheu
	    # (http://www.notsostupid.com/blog/2006/07/07/urls-on-rails/#comment-4373)
	    TRANSLIT_GROUPS = {
	      'ÀÁÂÃĀĄĂ' => 'A',        'ìíîïīĩĭįı' =>'i',         'ŕřŗ' =>'r',
	      'Ä' => 'Ae',             'Ĳ' => 'IJ',               'ŚŠŞŜȘ' => 'S',
	      'àáâãāąă' => 'a',        'Ĵ' => 'J',                'śšşŝș' => 's',
	      'ä' => 'ae',             'ĵ' => 'j',                'ŤŢŦȚ' => 'T',
	      'Æ' => 'Ae',             'Ķ' => 'K',                'ťţŧț' => 't',
	      'æ' => 'ae',             'ķĸ' => 'k',               'ÙÚÛŪŮŰŬŨŲ' =>'U',
	      'ÇĆČĈĊ' => 'C',          'ŁĽĹĻĿ' => 'L',            'Ü' => 'Ue',
	      'çćčĉċ' => 'c',          'łľĺļŀ' => 'l',            'ùúûūůűŭũų' =>'u',
	      'ĎĐÐ' => 'D',            'ÑŃŇŅŊ' => 'N',            'ü' => 'ue',
	      'ďđð' => 'd',            'ñńňņŉŋ' => 'n',           'Ŵ' => 'W',
	      'ÈÉÊËĒĘĚĔĖ' =>'E',       'ÒÓÔÕØŌŐŎ' => 'O',         'ŵ' => 'w',
	      'èéêëēęěĕė' =>'e',       'Ö' => 'Oe',               'ÝŶŸ' =>'Y',
	      'ƒ' => 'f',              'òóôõøōőŏ' => 'o',         'ýÿŷ' =>'y',
	      'ĜĞĠĢ' => 'G',           'ö' => 'oe',               'ŹŽŻ' =>'Z',
	      'ĝğġģ' => 'g',           'Œ' => 'OE',               'žżź' =>'z',
	      'ĤĦ' => 'H',             'œ' => 'oe',               'Å' => 'Aa',
	      'ĥħ' => 'h',             'ŔŘŖ' =>'R',               'å' => 'aa',
	      'ÌÍÎÏĪĨĬĮİ' =>'I',       'ß' => 'ss',               '×' => 'x',
	    }.freeze

	    # Per-character view of TRANSLIT_GROUPS, built once, so that a string can
	    # be transliterated in a single pass instead of one gsub per group.
	    TRANSLIT_MAP = TRANSLIT_GROUPS.each_with_object({}) { |(originals, substitution), map|
	      originals.each_char { |char| map[char] = substitution }
	    }.freeze

	    # Returns an ASCII-only copy of +str+.
	    #
	    # The string is NFKD-normalized first, so accented letters decompose into
	    # a base letter plus combining marks; every code point that is not below
	    # 127 is then dropped (this also drops DEL, code point 127).
	    #
	    # +str+ must be in a Unicode encoding (in practice UTF-8);
	    # String#unicode_normalize raises Encoding::CompatibilityError otherwise.
	    def ascii_str_from_str(str)
	      # 'U*' converts every code point; a bare 'U' would stop after the first.
	      str.unicode_normalize(:nfkd).unpack('U*').select { |cp| cp < 127 }.pack('U*')
	    end

	    # Replaces the characters listed in TRANSLIT_GROUPS by their ASCII
	    # substitutions and reduces whatever is left to ASCII via String#to_ascii.
	    def transliterate_str(str)
	      # Compose first so that decomposed input ("a" + U+0308) hits the table
	      # exactly like its precomposed form ("ä").
	      composed = str.unicode_normalize(:nfc)
	      # Every key of the table is non-ASCII, so only those characters need a lookup.
	      substituted = composed.gsub(/[^[:ascii:]]/) { |char| TRANSLIT_MAP.fetch(char, char) }
	      substituted.to_ascii
	    end

	    # Builds a slug from +str+: transliterate, downcase, join words with
	    # +separator+, drop everything that is neither a-z, 0-9 nor part of
	    # +separator+, and finally collapse the separator runs that the removal
	    # may have left behind. Leading/trailing separators are not trimmed.
	    def uri_slug_from_str(str, separator)
	      str.transliterate
	         .downcase
	         .separate(separator)
	         .strip_non_alphanum_except(separator)
	         .separate(separator)
	    end

	    # Replaces whitespace, "+", "-", en/em dashes and "_" in +str+ by
	    # +separator+ and collapses consecutive separators into a single one.
	    def separate_str(str, separator)
	      # Block form: the separator is inserted literally, without gsub's
	      # backslash processing of replacement strings.
	      joined = str.gsub(/[\s\+\-–—\_]/) { separator }
	      # Nothing to collapse; also avoids building a pattern from an empty string.
	      return joined if separator.empty?

	      # The group makes "+" repeat the whole separator, not only its last
	      # character, so multi-character separators collapse as well.
	      joined.gsub(/(?:#{Regexp.escape(separator)})+/) { separator }
	    end

	  end

	  module CoreExtensions#:nodoc:
	    # Mixin that exposes the UTF8Helper functions as String instance methods.
	    module String

	      # Returns a copy of the string that contains only ascii characters
	      # KD-Normalizes Unicode characters before stripping non-ascii
	      #
	      # This only performs “compatibility equivalence” according to Unicode
	      # That means it doesn't transform a whole lot of characters that most
	      # “normal” people might consider equivalent,
	      # e.g. × (MULTIPLICATION SIGN) to x (LATIN SMALL LETTER X) or
	      # æ (LATIN SMALL LETTER AE) => ae. To get that behaviour for a number of
	      # common signs use #transliterate
	      #
	      # Assumes the string is in UTF-8 encoding.
	      def to_ascii
	        CleanURIs::UTF8Helper.ascii_str_from_str(self)
	      end

	      # Transliterates common “special” characters into ASCII
	      # it relies on #to_ascii for the bulk of the work and augments that with
	      # a transliteration table for special cases. This table is not complete
	      # (and doesn't aim to be). It is at the moment also utterly ignorant of
	      # anything non-western and covers mostly typical European transliterations.
	      # Patches welcome.
	      #
	      # Gotcha: some transliterations contain upper- and lowercase characters
	      # (i.e. Ä => Ae) if the desired result is uppercase-only this method should be
	      # followed by a call to upcase
	      #
	      # HACK: This knows nothing about languages, and transliterations not only
	      # depend on the source encoding, but also on the source and target languages
	      # Implementing “proper” transliteration is pretty much the same as translation
	      # Also, being ignorant about languages and locales has the advantage of
	      # producing reliable results cross-system, unlike using Iconv
	      def transliterate
	        CleanURIs::UTF8Helper.transliterate_str(self)
	      end

	      # Returns copy of string suitable for a clean uri-fragment
	      # spaces and other delimiters are replaced by +separator+ (default: -)
	      # all non-ascii-alphanumeric characters are removed
	      def to_uri_slug(separator='-')
	        CleanURIs::UTF8Helper.uri_slug_from_str(self, separator)
	      end

	      # Same as #to_uri_slug, but with "_" as the separator.
	      def to_dom_id
	        to_uri_slug('_')
	      end

	      # Returns a copy of the string with all words joined by +separator+
	      def separate(separator)
	        CleanURIs::UTF8Helper.separate_str(self, separator)
	      end

	      # Returns a copy of the string with every character removed that is not
	      # a lowercase ASCII letter (a-z) or a digit (0-9). Uppercase letters are
	      # removed too, so downcase first if they should survive.
	      # If +exception+ is set, its characters are kept as well.
	      def strip_non_alphanum_except(exception=nil)
	        # to_s: Regexp.escape(nil) raises TypeError, which made the default
	        # argument unusable.
	        gsub(/[^a-z0-9#{Regexp.escape(exception.to_s)}]/, '')
	      end

	    end
	  end
	end

	# Mixin for strings that builds an English possessive form.
	module PosessiveForm
	  # Returns a copy of the string with a possessive suffix appended:
	  # a bare ’ when the string ends in a lowercase "s", ’s otherwise
	  # (this includes the empty string and strings ending in an uppercase "S").
	  def posessivize
	    # String#end_with? compares whole characters, so no multibyte wrapper is needed.
	    suffix = end_with?('s') ? '’' : '’s'
	    self + suffix
	  end
	end

	# Install both mixins on every String. The order only affects method lookup:
	# the module included last (CleanURIs::CoreExtensions::String) is searched
	# before PosessiveForm.
	[PosessiveForm, CleanURIs::CoreExtensions::String].each do |mixin|
	  String.send(:include, mixin)
	end