Skip to content

Instantly share code, notes, and snippets.

@ScottPJones
Created May 23, 2015 22:57
Show Gist options
  • Save ScottPJones/f3fb082ac30d337d91bc to your computer and use it in GitHub Desktop.
Save ScottPJones/f3fb082ac30d337d91bc to your computer and use it in GitHub Desktop.
Test generic vs. separate methods to check UTF-16/UTF-32/AbstractString
# This file is a part of Julia. License is MIT: http://julialang.org/license
module CheckUTF
# Compatibility shim: when running on a Julia release older than 0.4, define the
# newer type names used throughout this file as aliases of the older names.
if VERSION < v"0.4-"
typealias AbstractString String
typealias UInt Uint
typealias UInt8 Uint8
typealias UInt16 Uint16
typealias UInt32 Uint32
end
#=
@doc """
@brief Error messages for Unicode / UTF support
""" ->
=#
# Error codes passed to utf_errfunc; each value is the 1-based index of the
# corresponding message template in errMsgs.
const UTF_ERR_SHORT = 1
const UTF_ERR_CONT = 2
const UTF_ERR_LONG = 3
const UTF_ERR_NOT_LEAD = 4
const UTF_ERR_NOT_TRAIL = 5
const UTF_ERR_NOT_SURROGATE = 6
const UTF_ERR_MISSING_SURROGATE = 7
const UTF_ERR_INVALID = 8
const UTF_ERR_SURROGATE = 9
const UTF_ERR_NULL_16_TERMINATE = 10
const UTF_ERR_NULL_32_TERMINATE = 11
# Highest valid error code; utf_errfunc rejects codes outside 1:UTF_ERR_MAX
const UTF_ERR_MAX = 11
# Message templates indexed by the UTF_ERR_* codes (1 through UTF_ERR_MAX).
# utf_errfunc substitutes the placeholder <<1>> with the position and <<2>> with
# the offending value in hexadecimal.
const errMsgs = [
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>)",
    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
    "UTF16String data must be NULL-terminated",
    "UTF32String data must be NULL-terminated"
]
#=
@doc """
@brief Throws ArgumentError with information about the specific error, location, and character
@param[in] errCode::UTF_ERR
@param[in] errPos::Integer
@param[in] errChar::Integer
@throws never returns, always throws ArgumentError
""" ->
=#
# Build the message for `errCode` from errMsgs and throw it as an ArgumentError.
#   errCode  one of the UTF_ERR_* codes (1 through UTF_ERR_MAX)
#   errPos   position substituted for the <<1>> placeholder
#   errChar  value substituted, in hexadecimal, for the <<2>> placeholder
# Never returns normally: an out-of-range errCode also throws an ArgumentError.
function utf_errfunc(errCode::Integer, errPos::Integer, errChar::Integer)
    # Reject codes that have no entry in the message table
    (1 <= errCode <= UTF_ERR_MAX) ||
        throw(ArgumentError("Invalid error code for Unicode error: $errCode, Pos = $errPos, Char = $errChar"))
    # Fill in the position first, then the hexadecimal value
    template = errMsgs[errCode]
    withPos = replace(template, "<<1>>", string(errPos))
    throw(ArgumentError(replace(withPos, "<<2>>", hex(errChar))))
    -1 # never reached: the statement above always throws
end
# Surrogate classification by explicit range. A range comparison is used instead
# of masking with a 32-bit literal so that the answer is also correct for
# unsigned arguments wider than 32 bits (a 32-bit mask would drop their high bits).
# Leading (high) surrogate: 0xd800-0xdbff
is_surrogate_lead(c::Unsigned) = (0xd800 <= c <= 0xdbff)
# Trailing (low) surrogate: 0xdc00-0xdfff
is_surrogate_trail(c::Unsigned) = (0xdc00 <= c <= 0xdfff)
# Any surrogate: 0xd800-0xdfff
is_surrogate_char(c::Unsigned) = (0xd800 <= c <= 0xdfff)
# A UTF-8 continuation byte has the bit pattern 10xxxxxx
is_valid_continuation(c) = ((c & 0xc0) == 0x80)
# Option bits tested against the `options` argument of the check functions
const UTF_NO_LONG_NULL = 1 # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 2 # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG = 4 # accept long encodings (other than long null in UTF-8)
# Result bits OR-ed into the `flags` field of UTF_String_Counts
const UTF_LONG = 1 # Long encodings are present
const UTF_LATIN1 = 2 # characters in range 0x80-0xFF present
const UTF_UNICODE2 = 4 # characters in range 0x100-0x7ff present
const UTF_UNICODE3 = 8 # characters in range 0x800-0xd7ff, 0xe000-0xffff present
const UTF_UNICODE4 = 16 # non-BMP characters present
const UTF_SURROGATE = 32 # surrogate pairs present
# Read the next UTF-8 continuation byte. The generated code advances `pos` by one,
# stores str[pos] in `byt`, reports UTF_ERR_CONT through utf_errfunc if that byte
# is not a continuation byte, and otherwise shifts its low six bits into `ch`.
macro get_continuation!(ch, byt, str, pos)
    # All four arguments refer to variables of the caller, so escape each one once
    c, b, s, p = esc(ch), esc(byt), esc(str), esc(pos)
    quote
        $b = $s[$p += 1]
        is_valid_continuation($b) || utf_errfunc(UTF_ERR_CONT, $p, $b)
        $c = ($c << 6) | ($b & 0x3f)
    end
end
# Check a non-ASCII character to find out what type it is
# 0x80-0xff -> LATIN1
# 0x100-0x7ff -> UNICODE2
# 0x800-0xd7ff,0xe000-0xffff -> UNICODE3
# 0xd800-0xdfff -> SURROGATE
# 0x10000-0x10ffff -> UNICODE4
# Result record returned by the check_string* functions in this file: character
# counts grouped by the size of their UTF-8 encoding, plus the UTF_* result bits.
immutable UTF_String_Counts
cntT::Int # total # of characters
cnt2::Int # number of characters in the range 0x80:0x7ff (2-bytes in UTF-8)
cnt3::Int # number of characters in the range 0x800:0xd7ff,0xe000:0xffff (3-bytes)
cnt4::Int # number of characters in the range 0x10000:0x10ffff (4-bytes); surrogate pairs are counted here
flags::Int # OR of the UTF_LONG / UTF_LATIN1 / UTF_UNICODE* / UTF_SURROGATE bits
end
#=
@doc """
@brief Validates and calculates number of characters in a string
@param[in] str Vector of UInt8
@param[in] options flags to determine error handling (default 0)
@return (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws ArgumentError
""" ->
=#
# Validate a vector of UTF-8 bytes and count its characters by encoded size.
#   str      the bytes to check
#   options  OR of UTF_NO_LONG_NULL / UTF_NO_SURROGATES / UTF_ACCEPT_LONG (default 0)
# Returns a UTF_String_Counts. Invalid input raises an ArgumentError (through
# utf_errfunc, or directly for a byte that cannot start a sequence); every
# "starting at index" position refers to the first byte of the offending sequence.
function check_string_utf8(str::Vector{UInt8}, options::Integer=0)
    local byt::UInt8
    local ch::UInt32
    local surr::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    pos = 0
    len = sizeof(str)
    @inbounds while pos < len
        ch = str[pos += 1]
        cntT += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf can only continue a sequence, never start one
                throw(ArgumentError("invalid UTF-8 sequence starting at index $pos (0x$(hex(ch)) is not a valid leading byte)"))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x1f
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x7f
                    cnt2 += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 used as an encoding of '\0'
                    flags |= UTF_LONG
                else
                    # pos is now on the continuation byte; the sequence began one earlier
                    utf_errfunc(UTF_ERR_LONG, pos-1, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x0f
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_char(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = str[pos += 1]
                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr = 0xd
                    @get_continuation!(surr, byt, str, pos)
                    @get_continuation!(surr, byt, str, pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    cnt4 += 1
                elseif ch > 0x07ff
                    cnt3 += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # Overlong encoding: count it by the size of its shortest form,
                    # as the 4-byte branch does (values up to 0x7f add to no counter)
                    flags |= UTF_LONG
                    if ch > 0x7f
                        cnt2 += 1
                    end
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x07
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    cnt4 += 1
                elseif is_surrogate_char(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        cnt3 += 1
                    elseif ch > 0x7f
                        cnt2 += 1
                    end
                else
                    # three continuation bytes were consumed, so the lead byte is at pos-3
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
#=
@doc """
@brief Validates and calculates number of characters in a UTF-16 string
@param[in] str Vector{UInt16}
@param[in] len number of UInt16 code units of str to check
@return (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws ArgumentError
""" ->
=#
# Validate the first `len` UTF-16 code units of `str` and count the characters by
# the size their UTF-8 encoding would have. Returns a UTF_String_Counts. A
# surrogate pair counts as one 4-byte character and does not set UTF_SURROGATE.
# Unpaired or misordered surrogates are reported through utf_errfunc.
function check_string_utf16(str::Vector{UInt16}, len::Int)
    local unit::UInt32
    local total::Int=0, n2::Int=0, n3::Int=0, n4::Int=0, flags::Int=0
    local idx::Int = 0
    @inbounds while idx < len
        unit = str[idx += 1]
        total += 1
        # ASCII needs no further classification
        unit <= 0x7f && continue
        if unit < 0x800
            n2 += 1
            flags |= (unit < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif !is_surrogate_char(unit)
            n3 += 1
        elseif !is_surrogate_lead(unit)
            # a trailing surrogate with no leading surrogate before it
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, unit)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, unit)
            # the unit after a leading surrogate *must* be a trailing surrogate
            unit = str[idx += 1]
            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, idx, unit)
            n4 += 1
        end
    end
    # The 3- and 4-byte flags are derived from the counters once, at the end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return UTF_String_Counts(total, n2, n3, n4, flags)
end
#=
@doc """
@brief Validates and calculates number of characters in a UTF-32 string
@param[in] str Vector{UInt32}
@param[in] len number of UInt32 code units of str to check
@param[in] options flags to determine error handling (default 0)
@return (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws ArgumentError
""" ->
=#
# Validate the first `len` UTF-32 code units of `str` and count the characters by
# the size their UTF-8 encoding would have. Returns a UTF_String_Counts.
# Values above 0x10ffff and unpaired or misordered surrogates are reported through
# utf_errfunc. A well-formed surrogate pair counts as one 4-byte character and
# sets UTF_SURROGATE, unless `options` contains UTF_NO_SURROGATES, in which case
# it is reported as an error.
function check_string_utf32(str::Vector{UInt32}, len::Int, options::Integer=0)
    local code::UInt32
    local total::Int=0, n2::Int=0, n3::Int=0, n4::Int=0, flags::Int=0
    local idx::Int = 0
    @inbounds while idx < len
        code = str[idx += 1]
        total += 1
        # ASCII needs no further classification
        code <= 0x7f && continue
        if code < 0x800
            n2 += 1
            flags |= (code < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif code > 0xffff
            code > 0x10ffff && utf_errfunc(UTF_ERR_INVALID, idx, code)
            n4 += 1
        elseif !is_surrogate_char(code)
            n3 += 1
        elseif !is_surrogate_lead(code)
            # a trailing surrogate with no leading surrogate before it
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, code)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, code)
            # the unit after a leading surrogate *must* be a trailing surrogate
            code = str[idx += 1]
            is_surrogate_trail(code) || utf_errfunc(UTF_ERR_NOT_TRAIL, idx, code)
            n4 += 1
            (options & UTF_NO_SURROGATES) == 0 || utf_errfunc(UTF_ERR_SURROGATE, idx, code)
            flags |= UTF_SURROGATE
        end
    end
    # The 3- and 4-byte flags are derived from the counters once, at the end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return UTF_String_Counts(total, n2, n3, n4, flags)
end
# Validate an AbstractString character by character and count the characters by
# the size their UTF-8 encoding would have. Returns a UTF_String_Counts.
#   str      the string to check; it is walked with start/next up to endof(str)
#   options  UTF_NO_SURROGATES makes surrogate pairs an error (default 0)
# Errors are reported through utf_errfunc with the index of the offending character.
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    local idx::Int
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    local pos::Int = start(str)
    local len::Int = endof(str)
    # pos is the index of the *next* character to read, so the character at
    # index len itself must still be processed: loop while pos <= len
    @inbounds while pos <= len
        idx = pos                 # index of the character being examined
        ch, pos = next(str, pos)
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                # nothing left to read means the trailing surrogate is missing
                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                idx = pos
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
# Generic validator/counter for UTF-16 and UTF-32 code-unit vectors, Char vectors
# and strings. Returns a UTF_String_Counts; errors go through utf_errfunc.
#   str      the data to check
#   pos,len  for vectors: the number of elements already consumed (callers in this
#            file pass 0) and the number of elements to examine.
#            for strings: assumed to be the index of the first character to read
#            and the last valid index, e.g. start(str) and endof(str), as in
#            check_string_abs -- TODO confirm with callers
#   options  UTF_NO_SURROGATES makes surrogate pairs an error, except in UTF-16
# The tests on T select the access pattern for the type of `str`.
function check_string{T<:Union(Vector{UInt16},Vector{UInt32},Vector{Char},AbstractString)}(str::T, pos::Int, len::Int, options::Integer=0)
    local ch::UInt32
    local idx::Int
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    # For strings pos is the index of the next character, so index len must still
    # be read; for vectors pos is the last index already consumed
    @inbounds while (T <: AbstractString ? pos <= len : pos < len)
        # T is the concrete type of str, so strings are detected with <: (not ==)
        if T <: AbstractString
            idx = pos
            ch, pos = next(str, pos)
        else
            ch = str[pos += 1]
            idx = pos
        end
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif T != Vector{UInt16} && ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                (T <: AbstractString ? pos > len : pos == len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                if T <: AbstractString
                    idx = pos
                    ch, pos = next(str, pos)
                else
                    ch = str[pos += 1]
                    idx = pos
                end
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                # Surrogate pairs are the normal form in UTF-16, so they are neither
                # rejected nor flagged there (same result as check_string_utf16)
                if T != Vector{UInt16}
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                    flags |= UTF_SURROGATE
                end
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
# Benchmark driver: run the generic check_string over the UTF-16 data of `str`
# `max` times and return the result of the last run. When `max` < 1 the loop
# never assigns the result variable.
function chkutf16(str::UTF16String, max::Int)
    local result
    units = str.data
    iter = 0
    while iter < max
        # sizeof is in bytes; >>>1 converts it to a count of 16-bit units
        result = check_string(units, 0, sizeof(units)>>>1)
        iter += 1
    end
    result
end
# Benchmark driver: run the generic check_string over the UTF-32 data of `str`
# `max` times and return the result of the last run. When `max` < 1 the loop
# never assigns the result variable.
function chkutf32(str::UTF32String, max::Int)
    local result
    units = str.data
    iter = 0
    while iter < max
        # sizeof is in bytes; >>>2 converts it to a count of 32-bit units
        result = check_string(units, 0, sizeof(units)>>>2)
        iter += 1
    end
    result
end
# Benchmark driver: run check_string_utf8 over the bytes of `str` `max` times and
# return the result of the last run. When `max` < 1 the loop never assigns the
# result variable.
function chkstr8(str::UTF8String, max::Int)
    local result
    bytes = str.data
    iter = 0
    while iter < max
        result = check_string_utf8(bytes)
        iter += 1
    end
    result
end
# Benchmark driver: run the UTF-16 specific check_string_utf16 over the data of
# `str` `max` times and return the result of the last run. When `max` < 1 the
# loop never assigns the result variable.
function chkstr16(str::UTF16String, max::Int)
    local result
    units = str.data
    iter = 0
    while iter < max
        # sizeof is in bytes; >>>1 converts it to a count of 16-bit units
        result = check_string_utf16(units, sizeof(units)>>>1)
        iter += 1
    end
    result
end
# Benchmark driver: run the UTF-32 specific check_string_utf32 over the data of
# `str` `max` times and return the result of the last run. When `max` < 1 the
# loop never assigns the result variable.
function chkstr32(str::UTF32String, max::Int)
    local result
    # check_string_utf32 takes a Vector{UInt32}, so view the data as UInt32 once
    units = reinterpret(UInt32, str.data)
    iter = 0
    while iter < max
        # sizeof is in bytes; >>>2 converts it to a count of 32-bit units
        result = check_string_utf32(units, sizeof(units)>>>2)
        iter += 1
    end
    result
end
# Time every checker on the same text held in three encodings. Each driver is
# called with `n` as its repeat count; a label is printed before each @time report.
function tstchk(n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
# Encoding-specific checkers
print("Check UTF-8 ")
@time chkstr8(strUTF8,n)
print("Check UTF-16 ")
@time chkstr16(strUTF16,n)
print("Check UTF-32 ")
@time chkstr32(strUTF32,n)
# Generic check_string on the same UTF-16 and UTF-32 data, for comparison
print("Generic UTF-16 ")
@time chkutf16(strUTF16,n)
print("Generic UTF-32 ")
@time chkutf32(strUTF32,n)
end
# Print a header for one test case (label, repeat count, character length and the
# byte size of each encoding) and then run the timings through tstchk.
#   str  label printed in the header; typed as AbstractString, the name the rest
#        of this file uses (and aliases on older releases)
#   n    repeat count handed to tstchk
function tstall(str::AbstractString, n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    println("\n\n$str: Looping $n times, length=$(length(strUTF32))")
    println("UTF-8: $(sizeof(strUTF8)), UTF-16: $(sizeof(strUTF16)), UTF-32: $(sizeof(strUTF32))\n")
    tstchk(n, strUTF8, strUTF16, strUTF32)
end
# Convert the ASCII-content and surrogate test strings to UTF-16 and UTF-32, then
# run the full timing report for each. The remaining string arguments are
# accepted but not used here.
function tstsiz(n,strAscii,strA_UTF8,strL_UTF8,str2_UTF8,str3_UTF8,str4_UTF8,strS_UTF8)
    # All conversions are done before any timing output is produced
    ascii16 = utf16(strA_UTF8)
    surr16 = utf16(strS_UTF8)
    ascii32 = utf32(strA_UTF8)
    surr32 = utf32(strS_UTF8)
    tstall("ASCII:", n, strA_UTF8, ascii16, ascii32)
    tstall("Surrogates:", n, strS_UTF8, surr16, surr32)
end
export dotest
# Entry point of the benchmark: build the test strings, repeat each one 262144
# times, and hand them to tstsiz together with the repeat count `n`.
function dotest(n)
    reps = 262144
    # Raw UTF-8 bytes: ASCII, then U+00FF, U+07FF and U+7FFF, then the surrogates
    # D800 DC00 and DBFF DFFF each encoded as a 3-byte sequence
    rawbytes = b"abcdefghijk\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80\xed\xaf\xbf\xed\xbf\xbf"
    # Slicing the first 16 characters of a literal that ends in \uff keeps only
    # the ASCII part; presumably done to get non-ASCII-typed storage for it
    withlatin = "abcdefghijklmnop\uff"
    asciitext = "abcdefghijklmnop" ^ reps
    asciiutf8 = withlatin[1:16] ^ reps
    latintext = "abcdefghijk\uff\uff\uff\uff\uff" ^ reps
    twobyte = "abcdefghijk\uff\uff\uff\u7ff\u7ff" ^ reps
    threebyte = "abcdefghijk\uff\uff\uff\u7fff\u7fff" ^ reps
    fourbyte = "abcdefghijk\uff\u7ff\u7fff\U7ffff\U0fffff" ^ reps
    # Quadrupling nine times gives 4^9 = 262144 copies of the raw bytes
    for round = 1:9
        rawbytes = vcat(rawbytes, rawbytes, rawbytes, rawbytes)
    end
    tstsiz(n, asciitext, asciiutf8, latintext, twobyte, threebyte, fourbyte, UTF8String(rawbytes))
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment