norman/encode.rb

## encode.rb
# coding: utf-8
#
# Copyright (c) 2012 Norman Clarke
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

class String
  # Encodes codepoints that are either given in the +unsafe_chars+ argument or
  # lie outside the ASCII printable range (0x20..0x7E) to a hexadecimal HTML
  # character reference such as <tt>&#xf3;</tt> (http://bit.ly/KNupLT).
  #
  # The receiver is first transcoded to UTF-8, so it must be encodable to
  # UTF-8 for this to work properly.
  #
  # +unsafe_chars+ is a list of strings whose codepoints (as returned by
  # String#ord) must always be escaped. It defaults to
  # +['<', '>', '&', '"', "'"]+.
  #
  # Returns a new string; the receiver is not modified.
  def reference_encode(unsafe_chars = ['<', '>', '&', '"', "'"])
    unsafe_codepoints = unsafe_chars.map(&:ord)
    # "".dup keeps the accumulator mutable even under frozen string literals.
    encode(Encoding::UTF_8).each_codepoint.inject("".dup) do |buffer, cp|
      if unsafe_codepoints.include?(cp) || cp < 0x20 || cp > 0x7E
        buffer << "&#x#{cp.to_s(16)};"
      else
        # String#<< with an Integer appends the character for that codepoint.
        buffer << cp
      end
    end
  end

  # Decodes hexadecimal HTML character references (<tt>&#xHHHH;</tt>) in a
  # string to their UTF-8 equivalents.
  #
  # Only references made of one or more hexadecimal digits are decoded, in
  # either letter case. Anything else (for example <tt>&#x;</tt> or
  # <tt>&#xzz;</tt>) is left untouched rather than being turned into a NUL
  # character.
  #
  # Returns a new string; the receiver is not modified.
  def reference_decode
    gsub(/&#x([0-9a-f]+);/i) { Regexp.last_match(1).to_i(16).chr(Encoding::UTF_8) }
  end
end

require "test/unit"

# Round-trip checks for String#reference_encode and String#reference_decode.
class ReferenceEncoderTest < Test::Unit::TestCase
  # Printable ASCII is expected to come out unchanged in both directions.
  def test_ascii
    escaped = "Japan"
    plain = "Japan"
    assert_equal plain, escaped.reference_decode
    assert_equal escaped, plain.reference_encode
  end

  # A single non-ASCII character with a two-digit hex codepoint.
  def test_low_unicode
    escaped = "Jap&#xf3;n"
    plain = "Japón"
    assert_equal plain, escaped.reference_decode
    assert_equal escaped, plain.reference_encode
  end

  # Characters with four-digit hex codepoints.
  def test_high_unicode
    escaped = "&#x65e5;&#x672c;"
    plain = "日本"
    assert_equal plain, escaped.reference_decode
    assert_equal escaped, plain.reference_encode
  end

  # Input that is not UTF-8 to begin with.
  def test_non_unicode
    # Mac Roman is used rather than ISO-8859-1, whose codepoint for this
    # character happens to match UTF-8's.
    escaped = "Jap&#xf3;n"
    mac_roman = "Japón".encode("macRoman")
    assert_equal "Japón", escaped.reference_decode
    assert_equal escaped, mac_roman.reference_encode
  end

  # The default unsafe set, then a caller-supplied replacement for it.
  def test_unsafe_chars
    with_defaults = "<&'\">".reference_encode
    assert_equal "&#x3c;&#x26;&#x27;&#x22;&#x3e;", with_defaults

    with_custom = "<a>".reference_encode(["a"])
    assert_equal "<&#x61;>", with_custom
  end
end
	# coding: utf-8
	#
	# Copyright (c) 2012 Norman Clarke
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of
	# this software and associated documentation files (the "Software"), to deal in
	# the Software without restriction, including without limitation the rights to
	# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
	# the Software, and to permit persons to whom the Software is furnished to do so,
	# subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
	# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
	# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
	# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	class String
	  # Encodes codepoints that are either given in the +unsafe_chars+ argument or
	  # lie outside the ASCII printable range (0x20..0x7E) to a hexadecimal HTML
	  # character reference such as <tt>&#xf3;</tt> (http://bit.ly/KNupLT).
	  #
	  # The receiver is first transcoded to UTF-8, so it must be encodable to
	  # UTF-8 for this to work properly.
	  #
	  # +unsafe_chars+ is a list of strings whose codepoints (as returned by
	  # String#ord) must always be escaped. It defaults to
	  # +['<', '>', '&', '"', "'"]+.
	  #
	  # Returns a new string; the receiver is not modified.
	  def reference_encode(unsafe_chars = ['<', '>', '&', '"', "'"])
	    unsafe_codepoints = unsafe_chars.map(&:ord)
	    # "".dup keeps the accumulator mutable even under frozen string literals.
	    encode(Encoding::UTF_8).each_codepoint.inject("".dup) do |buffer, cp|
	      if unsafe_codepoints.include?(cp) || cp < 0x20 || cp > 0x7E
	        buffer << "&#x#{cp.to_s(16)};"
	      else
	        # String#<< with an Integer appends the character for that codepoint.
	        buffer << cp
	      end
	    end
	  end

	  # Decodes hexadecimal HTML character references (<tt>&#xHHHH;</tt>) in a
	  # string to their UTF-8 equivalents.
	  #
	  # Only references made of one or more hexadecimal digits are decoded, in
	  # either letter case. Anything else (for example <tt>&#x;</tt> or
	  # <tt>&#xzz;</tt>) is left untouched rather than being turned into a NUL
	  # character.
	  #
	  # Returns a new string; the receiver is not modified.
	  def reference_decode
	    gsub(/&#x([0-9a-f]+);/i) { Regexp.last_match(1).to_i(16).chr(Encoding::UTF_8) }
	  end
	end

	require "test/unit"

	# Round-trip checks for String#reference_encode and String#reference_decode.
	class ReferenceEncoderTest < Test::Unit::TestCase
	  # Printable ASCII is expected to come out unchanged in both directions.
	  def test_ascii
	    encoded, decoded = "Japan", "Japan"
	    assert_equal decoded, encoded.reference_decode
	    assert_equal encoded, decoded.reference_encode
	  end

	  # A single non-ASCII character with a two-digit hex codepoint.
	  def test_low_unicode
	    encoded, decoded = "Jap&#xf3;n", "Japón"
	    assert_equal decoded, encoded.reference_decode
	    assert_equal encoded, decoded.reference_encode
	  end

	  # Characters with four-digit hex codepoints.
	  def test_high_unicode
	    encoded, decoded = "&#x65e5;&#x672c;", "日本"
	    assert_equal decoded, encoded.reference_decode
	    assert_equal encoded, decoded.reference_encode
	  end

	  # Input that is not UTF-8 to begin with.
	  def test_non_unicode
	    # Use Mac Roman because the ISO-8859-1 codepoint happens to be the same
	    # as UTF-8's.
	    encoded, decoded = "Jap&#xf3;n", "Japón".encode("macRoman")
	    assert_equal "Japón", encoded.reference_decode
	    assert_equal encoded, decoded.reference_encode
	  end

	  # The default unsafe set, then a caller-supplied replacement for it.
	  def test_unsafe_chars
	    assert_equal "&#x3c;&#x26;&#x27;&#x22;&#x3e;", "<&'\">".reference_encode
	    assert_equal "<&#x61;>", "<a>".reference_encode(["a"])
	  end
	end