Created
May 17, 2018 07:11
-
-
Save Glutexo/c9db9a059adfff6a0f9390f061b972eb to your computer and use it in GitHub Desktop.
Practical demo showing manual UTF-8 decoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Practical demo showing manual UTF-8 decoding. Inspired by an article by Tim Bray [Characters vs.
# Bytes](http://www.tbray.org/ongoing/When/200x/2003/04/26/UTF).
# These are characters mentioned in the article. Their UTF-8 representation is written as an array of hex-encoded bytes.
utf8_chars = [
  %w[26],         # &
  %w[D0 96],      # Ж
  %w[E4 B8 AD],   # 中
  %w[F0 90 8D 86] # 𐍆
]

# Splits one UTF-8 byte into its payload bits.
#
# The signaling prefix of a byte is its run of leading one bits plus the zero bit that terminates the run; every bit
# below that zero is payload. Only the low eight bits of +byte+ are inspected.
#
# @param byte [Integer] a single byte (0..255)
# @return [Array(Integer, Integer)] the payload value and the number of payload bits; +[0, 0]+ when the byte has no
#   zero bit at all (0xFF) and therefore carries no payload
def utf8_payload_bits(byte)
  # Position of the highest zero bit, i.e. the end of the signaling prefix.
  zero_pos = 7.downto(0).find { |pos| byte[pos].zero? }
  return [0, 0] unless zero_pos

  # Mask away the prefix; exactly zero_pos payload bits remain.
  [byte & ((1 << zero_pos) - 1), zero_pos]
end

# Decodes the bytes of one UTF-8 sequence into a Unicode code point by concatenating the payload bits of every byte.
#
# This is a simplification: the signaling ones are not counted and the sequence is not validated, so an invalid UTF-8
# sequence results in an invalid output rather than an error.
#
# @param bytes [Array<Integer>] the bytes of the sequence, leading byte first
# @return [Integer] the decoded code point (0 for an empty sequence)
def decode_utf8_code_point(bytes)
  bytes.reduce(0) do |code_point, byte|
    bits, width = utf8_payload_bits(byte)
    # Make room for the new payload bits, then append them.
    (code_point << width) | bits
  end
end

utf8_chars.each do |bytes_hex|
  # Convert the hex-encoded bytes to numbers for the binary operations.
  bytes = bytes_hex.map { |hex| Integer(hex, 16) }
  payload = decode_utf8_code_point(bytes)

  # The payload is a Unicode code point. Encoded as UTF-8 the result should match the original bytes.
  char = payload.chr(Encoding::UTF_8)
  unicode_number = payload.to_s(16).rjust(4, '0')
  puts("U+#{unicode_number} #{char}")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment