public
Created

Test valid UTF-8

  • Download Gist
valid_utf8.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
# Tests whether str is well-formed UTF-8 by walking its bytes, following
# the byte-sequence rules described here:
# https://en.wikipedia.org/wiki/UTF-8 (see also RFC 3629, section 4)
#
# Besides counting continuation bytes, the check rejects sequences that have
# the right length but are still ill-formed: overlong 3 and 4 byte encodings,
# UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
#
# Deliberate restriction: a NUL byte (0) is always reported as invalid, even
# though U+0000 is a legal code point.
#
# The following options are supported:
#
# :bmp_only only 1 and 2 byte characters are valid; every 3 or 4 byte
#           sequence is rejected. NOTE: this is narrower than the real Basic
#           Multilingual Plane (which includes 3 byte characters); the
#           behaviour is kept as is for backward compatibility.
# :debug    prints debug info to stdout in case of an error
#
# @param str [#each_byte] the string whose raw bytes are inspected
# @param options [Hash] see the option list above
# @return [Boolean] true if every byte sequence is acceptable, false otherwise
def valid_utf8?(str, options = {})
  debug = options[:debug] || false
  bmp_only = options[:bmp_only] || false

  # Prints the reason (in debug mode only) and yields the failure result.
  reject = lambda do |message|
    puts message if debug
    false
  end

  # Bytes of the multibyte sequence currently being read; empty between
  # characters.
  pending = []

  str.each_byte do |byte|
    case byte
    when 0
      return reject.call("illegal 0 char detected")

    when 128..191
      # continuation byte: only legal while a multibyte sequence is open
      lead = pending.first
      return reject.call("illegal continuation char detected #{byte}") unless lead

      # pending only ever starts with a byte from 194..244, so the start byte
      # alone determines the total sequence length
      length = case lead
               when 194..223 then 2
               when 224..239 then 3
               else 4
               end

      if bmp_only && length > 2
        return reject.call("only BMP sequences allowed (but #{length} byte sequence detected (at #{byte}))")
      end

      if pending.size == 1
        # The first continuation byte is restricted for some start bytes:
        # 224 (E0) and 240 (F0) would otherwise allow overlong encodings,
        # 237 (ED) would allow surrogates and 244 (F4) would exceed U+10FFFF.
        allowed = case lead
                  when 224 then 160..191
                  when 237 then 128..159
                  when 240 then 144..191
                  when 244 then 128..143
                  else 128..191
                  end
        unless allowed.cover?(byte)
          return reject.call("illegal #{length} byte sequence detected (at #{byte})")
        end
      end

      pending << byte
      # character is complete and valid
      pending.clear if pending.size == length

    when 194..244
      # start byte of a multibyte sequence; the previous one must be finished.
      # The message reports the start byte of the interrupted sequence.
      unless pending.empty?
        return reject.call("illegal start byte detected (#{pending.first})")
      end
      pending << byte

    when 192, 193, 245..255
      # these bytes can never appear in UTF-8
      return reject.call("illegal byte detected (#{byte})")

    else
      # 1..127 is all right, unless it interrupts a multibyte sequence
      unless pending.empty?
        return reject.call("illegal regular byte detected (#{byte} after #{pending.first})")
      end
    end
  end

  return true if pending.empty?

  # input ended in the middle of a multibyte sequence
  if debug
    puts "incomplete byte sequence detected:"
    pending.each { |b| print " #{b}" }
    puts "\n"
  end
  false
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.