Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Test valid UTF-8

View valid_utf8.rb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
# Tests whether the bytes of str form well-formed UTF-8 (RFC 3629).
#
# The string is inspected byte by byte, so its declared encoding is ignored.
# In addition to structurally broken input (stray continuation bytes,
# truncated sequences, the bytes 0xC0, 0xC1 and 0xF5..0xFF), the following
# are rejected:
#
# * overlong 3 and 4 byte encodings (0xE0 0x80..0x9F, 0xF0 0x80..0x8F)
# * UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF)
# * code points above U+10FFFF (0xF4 0x90..0xBF)
# * the NUL byte, which this method deliberately treats as illegal even
#   though U+0000 is a valid code point
#
# The following options are supported:
#
# :bmp_only only 1 and 2 byte characters are valid. Despite its name this is
#           narrower than the Basic Multilingual Plane: every 3 byte
#           sequence (U+0800..U+FFFF) is rejected as well.
# :debug    prints the reason to stdout when the string is rejected
#
# Returns true if str is valid under these rules, false otherwise.
def valid_utf8?(str, options = {})
  debug = options[:debug] || false
  bmp_only = options[:bmp_only] || false
  rv = true
  utf8_char = [] # bytes of the multibyte sequence currently being read

  # Marks the string as invalid and reports why when debugging is enabled.
  fail_with = lambda do |message|
    puts message if debug
    rv = false
  end

  str.each_byte do |c|
    lead = utf8_char[0] # start byte of the open sequence, nil between characters
    case c
    when 0
      fail_with.call("illegal 0 char detected")
      break
    when 128..191
      # A continuation byte is only legal inside an open multibyte sequence.
      unless lead
        fail_with.call("illegal continuation char detected #{c}")
        break
      end

      # The start byte alone determines the total length of the sequence.
      length = case lead
               when 194..223 then 2
               when 224..239 then 3
               else 4
               end
      if bmp_only && length > 2
        fail_with.call("only BMP sequences allowed (but #{length} byte sequence detected (at #{c}))")
        break
      end

      if utf8_char.size == 1
        # Some start bytes narrow the range of the byte that follows them;
        # without this check overlong forms, surrogates and values beyond
        # U+10FFFF would slip through.
        allowed = case lead
                  when 224 then 160..191 # E0: below A0 is an overlong encoding
                  when 237 then 128..159 # ED: A0 and above are surrogates
                  when 240 then 144..191 # F0: below 90 is an overlong encoding
                  when 244 then 128..143 # F4: 90 and above exceed U+10FFFF
                  else 128..191
                  end
        unless allowed.include?(c)
          fail_with.call("overlong, surrogate or out of range #{length} byte sequence detected (at #{c})")
          break
        end
      end

      utf8_char << c
      # The character is complete once all of its bytes have been seen.
      utf8_char.clear if utf8_char.size == length
    when 194..244
      # Start byte of a multibyte sequence; the previous one must be finished.
      if lead
        fail_with.call("illegal start byte detected (#{lead})")
        break
      end
      utf8_char << c
    when 192, 193, 245..255
      # These bytes can never appear in UTF-8.
      fail_with.call("illegal byte detected (#{c})")
      break
    else
      # 1..127 is plain ASCII, but it must not interrupt a multibyte sequence.
      if lead
        fail_with.call("illegal regular byte detected (#{c} after #{lead})")
        break
      end
    end
  end

  # The input ended in the middle of a multibyte sequence.
  if rv && !utf8_char.empty?
    rv = false
    if debug
      puts "incomplete byte sequence detected:"
      utf8_char.each { |byte| print " #{byte}" }
      puts "\n"
    end
  end
  rv
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.