Glutexo/utf8_decode_demo.rb

## utf8_decode_demo.rb
# Practical demo showing manual UTF-8 decoding. Inspired by an article by Tim Bray [Characters vs.
# Bytes](http://www.tbray.org/ongoing/When/200x/2003/04/26/UTF).

# These are characters mentioned in the article. Their UTF-8 representation is written as an array of hex-encoded bytes.
utf8_chars = [
  %w[26], # &
  %w[D0 96], # Ж
  %w[E4 B8 AD], # 中
  %w[F0 90 8D 86] # 𐍆
]

# Assembles the Unicode code point carried by one UTF-8 byte sequence, bit by bit.
#
# Each byte is scanned from its most significant bit. The leading run of one bits and the first zero bit
# that terminates it are the signaling part and are skipped; every bit after that zero is payload and is
# appended to the result. This is deliberately simplified: the signaling bits are not counted or checked,
# so an invalid UTF-8 sequence yields a meaningless number instead of an error.
#
# @param bytes [Array<Integer>] the octets (0..255) of a single UTF-8 encoded character
# @return [Integer] the payload bits of all bytes concatenated; 0 for an empty array
def utf8_payload(bytes)
  bytes.reduce(0) do |payload, byte|
    capture = false
    # Walk the octet from bit 7 down to bit 0.
    7.downto(0) do |pos|
      bit = byte[pos] # Integer#[] gives the bit at that position as 0 or 1.
      if capture
        # Append the payload bit of the current byte.
        payload = (payload << 1) | bit
      elsif bit.zero?
        # The first zero bit ends the signaling prefix; everything after it is payload.
        capture = true
      end
    end
    payload
  end
end

utf8_chars.each do |bytes_hex|
  # Turn the hex-encoded bytes into integers so that bit operations can be applied to them.
  code_point = utf8_payload(bytes_hex.map(&:hex))

  # The payload is the Unicode code point. Encoded as UTF-8 the result should match the original bytes.
  char = code_point.chr(Encoding::UTF_8)
  unicode_number = code_point.to_s(16).rjust(4, '0')
  puts("U+#{unicode_number} #{char}")
end
	# Practical demo showing manual UTF-8 decoding. Inspired by an article by Tim Bray [Characters vs.
	# Bytes](http://www.tbray.org/ongoing/When/200x/2003/04/26/UTF).

	# These are characters mentioned in the article. Their UTF-8 representation is written as an array of hex-encoded bytes.
	utf8_chars = [
	%w(26), # &
	%w(D0 96), # Ж
	%w(E4 B8 AD), # 中
	%w(F0 90 8D 86) # 𐍆
	]
	utf8_chars.each do |bytes_hex|
	# Convert the hex-encoded bytes to actual binary_data and then to a number for binary operations.
	binary_data = bytes_hex.pack('H2' * bytes_hex.size)
	bytes_i = binary_data.unpack('C*')

	# Extract the payload parts from the bytes, leaving the signaling bits alone. This is simplified as no validation is
	# done. Invalid UTF-8 sequences would result in an invalid output.
	payload = 0
	bytes_i.each do |byte|
	pos_dec = 7 # An octet.
	capture = false
	while pos_dec >= 0
	pos_bin = 2 ** pos_dec
	bit = byte & pos_bin
	if capture
	# Append the payload bit of the current byte.
	payload <<= 1
	payload += 1 if bit > 0
	elsif bit == 0
	# This is the simplification. We start getting the payload after the first zero bit. We are not counting the
	# signaling ones to validate the sequence.
	capture = true
	end
	pos_dec = pos_dec.pred
	end
	end

	# Treat the payload as a UTF-16 character. Encoded as UTF-8 the result should match the original bytes.
	char = payload.chr(Encoding::UTF_16BE).encode(Encoding::UTF_8)
	unicode_number = payload.to_s(16).rjust(4, '0')
	puts("U+#{unicode_number} #{char}")
	end