Created
April 3, 2012 21:12
-
-
Save lumpidu/2295531 to your computer and use it in GitHub Desktop.
Test valid UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tests str for valid UTF-8 as described here:
# https://secure.wikimedia.org/wikipedia/en/wiki/Utf8
#
# The string is scanned byte by byte (its declared encoding is ignored).
# It is rejected when it contains:
#
#   * a NUL byte (0),
#   * a byte that can never occur in UTF-8 (192, 193, 245..255),
#   * a continuation byte (128..191) that does not follow a start byte,
#   * a start byte or an ASCII byte inside an unfinished multibyte sequence,
#   * an overlong 3 or 4 byte encoding, an encoded UTF-16 surrogate
#     (U+D800..U+DFFF) or a code point above U+10FFFF,
#   * a multibyte sequence that is cut off by the end of the string.
#
# The following options are supported:
#
# :bmp_only   only 1 and 2 byte characters are valid. NOTE: this is stricter
#             than the Basic Multilingual Plane, which also contains the
#             code points encoded with 3 bytes; the behaviour is kept for
#             backward compatibility.
# :debug      prints debug info to stdout in case of an error
#
# Returns true if str is valid according to the rules above, false otherwise.
def valid_utf8?(str, options = {})
  debug = options[:debug] || false
  bmp_only = options[:bmp_only] || false

  # Prints the reason (debug mode only) and evaluates to the failure result.
  reject = lambda do |message|
    puts message if debug
    false
  end

  # Bytes of the multibyte character read so far; empty between characters.
  pending = []

  str.each_byte do |byte|
    lead = pending.first

    case byte
    when 0
      return reject.call("illegal 0 char detected")
    when 128..191
      # A continuation byte is only legal inside a started sequence.
      return reject.call("illegal continuation char detected #{byte}") unless lead

      # Only 194..244 is ever stored as start byte, so the start byte alone
      # determines the total length of the sequence.
      length = if lead < 224
                 2
               elsif lead < 240
                 3
               else
                 4
               end

      if bmp_only && length > 2
        return reject.call("only BMP sequences allowed (but #{length} byte sequence detected (at #{byte}))")
      end

      if pending.size == 1
        # The byte right after the start byte has a narrower range for four
        # start bytes; this excludes overlong forms (224, 240), surrogates
        # (237) and code points above U+10FFFF (244).
        allowed = case lead
                  when 224 then 160..191
                  when 237 then 128..159
                  when 240 then 144..191
                  when 244 then 128..143
                  else 128..191
                  end
        unless allowed.include?(byte)
          return reject.call("illegal #{length} byte sequence detected (at #{byte})")
        end
      end

      pending << byte
      # Character is complete: get ready for the next one.
      pending.clear if pending.size == length
    when 194..244
      # Start byte of a multibyte sequence; must not interrupt another one.
      return reject.call("illegal start byte detected (#{lead})") if lead

      pending << byte
    when 192, 193, 245..255
      # Bytes that never appear in UTF-8.
      return reject.call("illegal byte detected (#{byte})")
    else
      # 1..127 is all right unless a multibyte sequence is still open.
      return reject.call("illegal regular byte detected (#{byte} after #{lead})") if lead
    end
  end

  return true if pending.empty?

  # The string ended in the middle of a multibyte character.
  leftover = pending.map { |pending_byte| " #{pending_byte}" }.join
  reject.call("incomplete byte sequence detected:\n#{leftover}")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment