ScottPJones/chkutf.jl

## chkutf.jl
# This file is a part of Julia. License is MIT: http://julialang.org/license

module CheckUTF
# Compatibility shim: on Julia versions before 0.4 the names used throughout this
# module are defined as aliases of the older spellings.
if VERSION < v"0.4-"
    typealias UInt8 Uint8
    typealias UInt16 Uint16
    typealias UInt32 Uint32
    typealias UInt Uint
    typealias AbstractString String
end
#=
@doc """
@brief      Error messages for Unicode / UTF support
""" ->
=#

# Error codes accepted by utf_errfunc.  Each code is the 1-based index of its
# message template in errMsgs.
const UTF_ERR_SHORT             = 1     # sequence is missing continuation bytes
const UTF_ERR_CONT              = 2     # expected a continuation byte
const UTF_ERR_LONG              = 3     # overlong encoding
const UTF_ERR_NOT_LEAD          = 4     # expected a leading surrogate
const UTF_ERR_NOT_TRAIL         = 5     # expected a trailing surrogate
const UTF_ERR_NOT_SURROGATE     = 6     # not a valid surrogate
const UTF_ERR_MISSING_SURROGATE = 7     # leading surrogate without a trailing one
const UTF_ERR_INVALID           = 8     # invalid Unicode character
const UTF_ERR_SURROGATE         = 9     # surrogate encoding not allowed here
const UTF_ERR_NULL_16_TERMINATE = 10    # UTF16String data not NULL-terminated
const UTF_ERR_NULL_32_TERMINATE = 11    # UTF32String data not NULL-terminated
# Largest valid error code (the last one defined above)
const UTF_ERR_MAX               = UTF_ERR_NULL_32_TERMINATE

# Message templates, indexed by error code (1-based).
# "<<1>>" is replaced by the reported index and "<<2>>" by the offending value in hex.
# The first and sixth templates previously had unbalanced parentheses.
const errMsgs = [
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>)",
    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
    "UTF16String data must be NULL-terminated",
    "UTF32String data must be NULL-terminated"
]

#=
@doc """
@brief      Throws ArgumentError with information about the specific error, location, and character

@param[in]  errCode::UTF_ERR
@param[in]  errPos:: Integer
@param[in]  errChar::Integer

@throws never returns, always throws ArgumentError
""" ->
=#
# Raise an ArgumentError describing a Unicode/UTF validation failure.
#
#   errCode  error code in 1:UTF_ERR_MAX; selects the template in errMsgs
#   errPos   index substituted for "<<1>>" in the template
#   errChar  offending value, substituted in hexadecimal for "<<2>>"
#
# Never returns normally.  An out-of-range errCode raises an ArgumentError
# that reports the bad code together with errPos and errChar.
function utf_errfunc(errCode::Integer, errPos::Integer, errChar::Integer)
    if !(1 <= errCode <= UTF_ERR_MAX)
        throw(ArgumentError("Invalid error code for Unicode error: $errCode, Pos = $errPos, Char = $errChar"))
    end
    template = errMsgs[errCode]
    withpos = replace(template, "<<1>>", string(errPos))
    throw(ArgumentError(replace(withpos, "<<2>>", hex(errChar))))
    -1  # not reached: both paths above throw
end

# Surrogate classification of a code unit / code point `c`.
# Range comparisons are used rather than masking with a fixed-width literal:
# `~0x003ff` is a 32-bit mask, so for wider unsigned arguments the bits above
# bit 31 were dropped and e.g. 0x000000010000d800 was reported as a surrogate.
is_surrogate_lead(c::Unsigned)  = (0xd800 <= c <= 0xdbff)   # leading (high) surrogate
is_surrogate_trail(c::Unsigned) = (0xdc00 <= c <= 0xdfff)   # trailing (low) surrogate
is_surrogate_char(c::Unsigned)  = (0xd800 <= c <= 0xdfff)   # either kind of surrogate
# True when bits 7..6 of `c` are 10, i.e. the UTF-8 continuation byte pattern
is_valid_continuation(c) = ((c & 0xc0) == 0x80)

# Option bits tested against the `options` argument of the validators
const UTF_NO_LONG_NULL  = 1 << 0    # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 1 << 1    # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2    # accept long encodings (other than long null in UTF-8)

# Result bits stored in the `flags` field of UTF_String_Counts
const UTF_LONG      = 1 << 0        # Long encodings are present
const UTF_LATIN1    = 1 << 1        # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2        # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3        # characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4        # non-BMP characters present
const UTF_SURROGATE = 1 << 5        # surrogate pairs present

# Get a UTF-8 continuation byte, give error if invalid, and update position and character value
# Expands to code that advances `pos`, reads the next byte of `str` into `byt`,
# raises UTF_ERR_CONT if it is not a continuation byte, and otherwise shifts its
# low six bits into `ch`.  All four arguments are evaluated in the caller's scope.
macro get_continuation!(ch, byt, str, pos)
    chvar = esc(ch)
    bytvar = esc(byt)
    strvar = esc(str)
    posvar = esc(pos)
    quote
        $(bytvar) = $(strvar)[$(posvar) += 1]
        is_valid_continuation($(bytvar)) || utf_errfunc(UTF_ERR_CONT, $(posvar), $(bytvar))
        $(chvar) = ($(chvar) << 6) | ($(bytvar) & 0x3f)
    end
end

# Check a non-ASCII character to find out what type it is
# 0x80-0xff -> LATIN1
# 0x100-0x7ff -> UNICODE2
# 0x800-0xd7ff,0xe000-0xffff -> UNICODE3
# 0xd800-0xdfff -> SURROGATE
# 0x10000-0x10ffff -> UNICODE4

# Result record returned by the check_string* validators in this file:
# character counts grouped by UTF-8 encoded size, plus content flags.
immutable UTF_String_Counts
    cntT::Int      # total number of characters
    cnt2::Int      # number of characters in the range 0x80:0x7ff (2 bytes in UTF-8)
    cnt3::Int      # number of characters in the range 0x800:0xd7ff,0xe000:0xffff (3 bytes)
    cnt4::Int      # number of characters in the range 0x10000:0x10ffff (4 bytes)
    flags::Int     # bitwise OR of UTF_LONG, UTF_LATIN1, UTF_UNICODE2..4 and UTF_SURROGATE bits
end

#=
@doc """
@brief      Validates and calculates number of characters in a string

@param[in]  str     Vector of UInt8
@param[in]  options flags to determine error handling (default 0)

@return     (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError
""" ->
=#
# Validate a UTF-8 byte vector and count its characters by encoded size.
#
#   str      bytes to validate; all sizeof(str) bytes are examined
#   options  bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES, UTF_ACCEPT_LONG
#
# Returns a UTF_String_Counts.  Throws ArgumentError at the first invalid
# sequence.  A surrogate pair written as two 3-byte sequences counts as one
# character and is tallied in cnt4.
function check_string_utf8(str::Vector{UInt8}, options::Integer=0)
    local byt::UInt8
    local ch::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    pos = 0
    len = sizeof(str)
    @inbounds while pos < len
        ch = str[pos += 1]
        cntT += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence;
                # previously they were decoded as if they were 2-byte lead bytes
                throw(ArgumentError("invalid UTF-8 sequence starting at index $pos (0x$(hex(ch)) is a continuation byte, not a lead byte)"))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x1f      # a 2-byte lead byte carries five payload bits
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x7f
                    cnt2 += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 is accepted as '\0' unless UTF_NO_LONG_NULL is set
                    flags |= UTF_LONG
                else
                    # pos is now on the second byte; the sequence started at pos-1
                    utf_errfunc(UTF_ERR_LONG, pos-1, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x0f
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_char(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = str[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr::UInt32 = 0xd  # payload of the 0xed lead byte just consumed
                    @get_continuation!(surr, byt, str, pos)
                    @get_continuation!(surr, byt, str, pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    cnt4 += 1
                elseif ch > 0x07ff
                    cnt3 += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                    # count by decoded size, as the 2- and 4-byte branches do:
                    # an overlong ASCII character is not a 2-byte character
                    if ch > 0x7f
                        cnt2 += 1
                    end
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x07
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    cnt4 += 1
                elseif is_surrogate_char(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        cnt3 += 1
                    elseif ch > 0x7f
                        cnt2 += 1
                    end
                else
                    # the 4-byte sequence started at pos-3, as in the branches above
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                # lead bytes 0xf5-0xff are never valid
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end

#=
@doc """
@brief      Validates and calculates number of characters in a UTF-16 string

@param[in]  str     Vector{UInt16}
@param[in]  len     number of UInt16 code units of str to check

@return     (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError
""" ->
=#
# Validate the first `len` code units of a UTF-16 vector and count characters.
#
#   str  UTF-16 code units
#   len  number of code units to examine
#
# Returns a UTF_String_Counts; a lead/trail surrogate pair counts as one
# character in cnt4.  Throws ArgumentError on an unpaired surrogate.
function check_string_utf16(str::Vector{UInt16}, len::Int)
    local unit::UInt32
    local total::Int=0, two::Int=0, three::Int=0, four::Int=0, flags::Int=0
    local pos::Int = 0
    @inbounds while pos < len
        pos += 1
        unit = str[pos]
        total += 1
        unit > 0x7f || continue     # ASCII needs no further classification
        if unit < 0x100
            two += 1
            flags |= UTF_LATIN1
        elseif unit < 0x800
            two += 1
            flags |= UTF_UNICODE2
        elseif !is_surrogate_char(unit)
            three += 1
        elseif !is_surrogate_lead(unit)
            # a trailing surrogate that was not preceded by a leading one
            utf_errfunc(UTF_ERR_NOT_LEAD, pos, unit)
        else
            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, unit)
            # the following code unit has to be a trailing surrogate
            pos += 1
            unit = str[pos]
            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, pos, unit)
            four += 1
        end
    end
    if three != 0
        flags |= UTF_UNICODE3
    end
    if four != 0
        flags |= UTF_UNICODE4
    end
    return UTF_String_Counts(total, two, three, four, flags)
end

#=
@doc """
@brief      Validates and calculates number of characters in a UTF-32 string

@param[in]  str     Vector{UInt32}
@param[in]  len     number of elements of str to check
@param[in]  options flags to determine error handling (default 0)

@return     (total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError
""" ->
=#
# Validate the first `len` elements of a UTF-32 vector and count characters.
#
#   str      32-bit code units
#   len      number of elements to examine
#   options  only UTF_NO_SURROGATES is consulted here
#
# Returns a UTF_String_Counts.  Throws ArgumentError for values above 0x10ffff,
# for unpaired surrogates, and for surrogate pairs when UTF_NO_SURROGATES is set.
function check_string_utf32(str::Vector{UInt32}, len::Int, options::Integer=0)
    local unit::UInt32
    local total::Int=0, two::Int=0, three::Int=0, four::Int=0, flags::Int=0
    local pos::Int = 0
    @inbounds while pos < len
        pos += 1
        unit = str[pos]
        total += 1
        unit > 0x7f || continue     # ASCII needs no further classification
        if unit < 0x100
            two += 1
            flags |= UTF_LATIN1
        elseif unit < 0x800
            two += 1
            flags |= UTF_UNICODE2
        elseif unit > 0xffff
            unit > 0x10ffff && utf_errfunc(UTF_ERR_INVALID, pos, unit)
            four += 1
        elseif !is_surrogate_char(unit)
            three += 1
        elseif !is_surrogate_lead(unit)
            # a trailing surrogate that was not preceded by a leading one
            utf_errfunc(UTF_ERR_NOT_LEAD, pos, unit)
        else
            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, unit)
            # the following element has to be a trailing surrogate
            pos += 1
            unit = str[pos]
            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, pos, unit)
            four += 1
            (options & UTF_NO_SURROGATES) == 0 || utf_errfunc(UTF_ERR_SURROGATE, pos, unit)
            flags |= UTF_SURROGATE
        end
    end
    if three != 0
        flags |= UTF_UNICODE3
    end
    if four != 0
        flags |= UTF_UNICODE4
    end
    return UTF_String_Counts(total, two, three, four, flags)
end

# Validate an AbstractString character by character and count characters.
#
#   str      string to walk with the start/next/done iteration protocol
#   options  only UTF_NO_SURROGATES is consulted here
#
# Returns a UTF_String_Counts; throws ArgumentError on invalid content.
#
# The loop previously ran `while pos < endof(str)`, which stopped before the
# last character was read; it now runs until `done`.  Error positions report
# the index of the offending character rather than the index after it.
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    local pos::Int = start(str)
    local idx::Int = pos            # index of the character currently in `ch`
    @inbounds while !done(str, pos)
        idx = pos
        ch, pos = next(str, pos)
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                # `pos` already points past the lead, so "missing" means no input is left
                done(str, pos) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                idx = pos
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end

# Generic validator for UTF-16 / UTF-32 / Char vectors and AbstractStrings.
#
#   str      data to validate
#   pos,len  for vectors: `pos` elements are already consumed (pass 0 to start at
#            the first element) and elements up to index `len` are examined;
#            for strings: `pos` is the index of the next character to read and
#            `len` the index of the last one (e.g. start(str) and endof(str))
#   options  only UTF_NO_SURROGATES is consulted, and not for Vector{UInt16}
#
# Returns a UTF_String_Counts; throws ArgumentError on invalid content.
#
# The string branches were previously guarded by `T == AbstractString`, which
# is never true because T is the concrete type of `str`; strings therefore
# took the vector indexing path.  The guard is now `T <: AbstractString`, and
# the loop bound and missing-surrogate test follow the string position
# convention.  Behaviour for vector arguments is unchanged.
function check_string{T<:Union(Vector{UInt16},Vector{UInt32},Vector{Char},AbstractString)}(str::T, pos::Int, len::Int, options::Integer=0)
    local ch::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    local idx::Int = pos            # index of the element currently in `ch`
    isstr = T <: AbstractString
    @inbounds while (isstr ? pos <= len : pos < len)
        if isstr
            idx = pos
            ch, pos = next(str, pos)
        else
            ch = str[pos += 1]
            idx = pos
        end
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif T != Vector{UInt16} && ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                (isstr ? pos > len : pos == len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                if isstr
                    idx = pos
                    ch, pos = next(str, pos)
                else
                    ch = str[pos += 1]
                    idx = pos
                end
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                if T != Vector{UInt16}
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                end
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end

# Benchmark driver: run the generic check_string over the string's UTF-16 data
# `max` times and return the result of the last run.
function chkutf16(str::UTF16String, max::Int)
    local result
    units = str.data
    for iter = 1:max
        result = check_string(units, 0, sizeof(units)>>>1)  # byte size / 2 = code units
    end
    result
end
# Benchmark driver: run the generic check_string over the string's UTF-32 data
# `max` times and return the result of the last run.
function chkutf32(str::UTF32String, max::Int)
    local result
    units = str.data
    for iter = 1:max
        result = check_string(units, 0, sizeof(units)>>>2)  # byte size / 4 = code units
    end
    result
end
# Benchmark driver: run check_string_utf8 over the string's bytes `max` times
# and return the result of the last run.
function chkstr8(str::UTF8String, max::Int)
    local result
    bytes = str.data
    for iter = 1:max
        result = check_string_utf8(bytes)
    end
    result
end
# Benchmark driver: run check_string_utf16 over the string's UTF-16 data `max`
# times and return the result of the last run.
function chkstr16(str::UTF16String, max::Int)
    local result
    units = str.data
    for iter = 1:max
        result = check_string_utf16(units, sizeof(units)>>>1)   # byte size / 2 = code units
    end
    result
end
# Benchmark driver: run check_string_utf32 over the string's data, viewed as
# UInt32 values, `max` times and return the result of the last run.
function chkstr32(str::UTF32String, max::Int)
    local result
    units = reinterpret(UInt32, str.data)
    for iter = 1:max
        result = check_string_utf32(units, sizeof(units)>>>2)   # byte size / 4 = code units
    end
    result
end

# Time every validator on the same text in its three encodings.
# Each chk* driver repeats its check n times; @time prints the cost after the label.
function tstchk(n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
        # encoding-specific validators
        print("Check UTF-8    ")
        @time chkstr8(strUTF8,n)
        print("Check UTF-16   ")
        @time chkstr16(strUTF16,n)
        print("Check UTF-32   ")
        @time chkstr32(strUTF32,n)
        # generic check_string on the same UTF-16 / UTF-32 data
        print("Generic UTF-16 ")
        @time chkutf16(strUTF16,n)
        print("Generic UTF-32 ")
        @time chkutf32(strUTF32,n)
end

# Print a header for one test case (label, repeat count, character length and
# byte size of each encoding), then run the timings.
#
#   str   label printed in the header
#   n     number of repetitions passed on to tstchk
#
# The label is typed AbstractString, the name this file's version shim makes
# available on every supported Julia version, instead of the older `String`.
function tstall(str::AbstractString, n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    println("\n\n$str: Looping $n times, length=$(length(strUTF32))")
    println("UTF-8: $(sizeof(strUTF8)), UTF-16: $(sizeof(strUTF16)), UTF-32: $(sizeof(strUTF32))\n")
    tstchk(n, strUTF8, strUTF16, strUTF32)
end

# Convert the ASCII and surrogate test strings to UTF-16 and UTF-32, then run
# the timing suite on both.  Only strA_UTF8 and strS_UTF8 are used; the other
# string arguments are accepted but not read.
function tstsiz(n,strAscii,strA_UTF8,strL_UTF8,str2_UTF8,str3_UTF8,str4_UTF8,strS_UTF8)
    # all conversions happen before any timing output is printed
    ascii16 = utf16(strA_UTF8)
    surr16  = utf16(strS_UTF8)
    ascii32 = utf32(strA_UTF8)
    surr32  = utf32(strS_UTF8)
    tstall("ASCII:", n, strA_UTF8, ascii16, ascii32)
    tstall("Surrogates:", n, strS_UTF8, surr16, surr32)
end

export dotest
# Build the benchmark inputs and run the timing suite with n repetitions.
# Every 16-character base string is repeated 262144 times; the raw byte
# sequence is quadrupled nine times, which is the same factor (4^9).
function dotest(n)
    reps = 262144
    # Create some ASCII, UTF8, UTF16, and UTF32 strings
    baseascii = "abcdefghijklmnop\uff"
    binstr    = b"abcdefghijk\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80\xed\xaf\xbf\xed\xbf\xbf"
    strAscii  = "abcdefghijklmnop" ^ reps
    strA_UTF8 = baseascii[1:16] ^ reps
    strL_UTF8 = "abcdefghijk\uff\uff\uff\uff\uff" ^ reps
    str2_UTF8 = "abcdefghijk\uff\uff\uff\u7ff\u7ff" ^ reps
    str3_UTF8 = "abcdefghijk\uff\uff\uff\u7fff\u7fff" ^ reps
    str4_UTF8 = "abcdefghijk\uff\u7ff\u7fff\U7ffff\U0fffff" ^ reps

    for round = 1:9
        binstr = vcat(binstr, binstr, binstr, binstr)
    end
    tstsiz(n, strAscii, strA_UTF8, strL_UTF8, str2_UTF8, str3_UTF8, str4_UTF8, UTF8String(binstr))
end
end
	# This file is a part of Julia. License is MIT: http://julialang.org/license

	module CheckUTF
	# Compatibility shim: on Julia versions before 0.4 the names used throughout this
	# module are defined as aliases of the older spellings.
	if VERSION < v"0.4-"
	    typealias UInt8 Uint8
	    typealias UInt16 Uint16
	    typealias UInt32 Uint32
	    typealias UInt Uint
	    typealias AbstractString String
	end
	#=
	@doc """
	@brief Error messages for Unicode / UTF support
	""" ->
	=#

	# Error codes accepted by utf_errfunc.  Each code is the 1-based index of its
	# message template in errMsgs.
	const UTF_ERR_SHORT             = 1     # sequence is missing continuation bytes
	const UTF_ERR_CONT              = 2     # expected a continuation byte
	const UTF_ERR_LONG              = 3     # overlong encoding
	const UTF_ERR_NOT_LEAD          = 4     # expected a leading surrogate
	const UTF_ERR_NOT_TRAIL         = 5     # expected a trailing surrogate
	const UTF_ERR_NOT_SURROGATE     = 6     # not a valid surrogate
	const UTF_ERR_MISSING_SURROGATE = 7     # leading surrogate without a trailing one
	const UTF_ERR_INVALID           = 8     # invalid Unicode character
	const UTF_ERR_SURROGATE         = 9     # surrogate encoding not allowed here
	const UTF_ERR_NULL_16_TERMINATE = 10    # UTF16String data not NULL-terminated
	const UTF_ERR_NULL_32_TERMINATE = 11    # UTF32String data not NULL-terminated
	# Largest valid error code (the last one defined above)
	const UTF_ERR_MAX               = UTF_ERR_NULL_32_TERMINATE

	# Message templates, indexed by error code (1-based).
	# "<<1>>" is replaced by the reported index and "<<2>>" by the offending value in hex.
	# The first and sixth templates previously had unbalanced parentheses.
	const errMsgs = [
	    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
	    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
	    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
	    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
	    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
	    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>)",
	    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
	    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
	    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
	    "UTF16String data must be NULL-terminated",
	    "UTF32String data must be NULL-terminated"
	]

	#=
	@doc """
	@brief Throws ArgumentError with information about the specific error, location, and character

	@param[in] errCode::UTF_ERR
	@param[in] errPos:: Integer
	@param[in] errChar::Integer

	@throws never returns, always throws ArgumentError
	""" ->
	=#
	# Raise an ArgumentError describing a Unicode/UTF validation failure.
	#
	#   errCode  error code in 1:UTF_ERR_MAX; selects the template in errMsgs
	#   errPos   index substituted for "<<1>>" in the template
	#   errChar  offending value, substituted in hexadecimal for "<<2>>"
	#
	# Never returns normally.  An out-of-range errCode raises an ArgumentError
	# that reports the bad code together with errPos and errChar.
	# (The range test was previously written with backslash-escaped `\|\|`,
	# which is not valid Julia.)
	function utf_errfunc(errCode::Integer, errPos::Integer, errChar::Integer)
	    if !(1 <= errCode <= UTF_ERR_MAX)
	        throw(ArgumentError("Invalid error code for Unicode error: $errCode, Pos = $errPos, Char = $errChar"))
	    end
	    template = errMsgs[errCode]
	    withpos = replace(template, "<<1>>", string(errPos))
	    throw(ArgumentError(replace(withpos, "<<2>>", hex(errChar))))
	    -1  # not reached: both paths above throw
	end

	# Surrogate classification of a code unit / code point `c`.
	# Range comparisons are used rather than masking with a fixed-width literal:
	# `~0x003ff` is a 32-bit mask, so for wider unsigned arguments the bits above
	# bit 31 were dropped and e.g. 0x000000010000d800 was reported as a surrogate.
	is_surrogate_lead(c::Unsigned)  = (0xd800 <= c <= 0xdbff)   # leading (high) surrogate
	is_surrogate_trail(c::Unsigned) = (0xdc00 <= c <= 0xdfff)   # trailing (low) surrogate
	is_surrogate_char(c::Unsigned)  = (0xd800 <= c <= 0xdfff)   # either kind of surrogate
	# True when bits 7..6 of `c` are 10, i.e. the UTF-8 continuation byte pattern
	is_valid_continuation(c) = ((c & 0xc0) == 0x80)

	# Option bits tested against the `options` argument of the validators
	const UTF_NO_LONG_NULL  = 1 << 0    # don't accept 0xc0 0x80 for '\0'
	const UTF_NO_SURROGATES = 1 << 1    # don't accept surrogate pairs in UTF-8/UTF-32
	const UTF_ACCEPT_LONG   = 1 << 2    # accept long encodings (other than long null in UTF-8)

	# Result bits stored in the `flags` field of UTF_String_Counts
	const UTF_LONG      = 1 << 0        # Long encodings are present
	const UTF_LATIN1    = 1 << 1        # characters in range 0x80-0xFF present
	const UTF_UNICODE2  = 1 << 2        # characters in range 0x100-0x7ff present
	const UTF_UNICODE3  = 1 << 3        # characters in range 0x800-0xd7ff, 0xe000-0xffff
	const UTF_UNICODE4  = 1 << 4        # non-BMP characters present
	const UTF_SURROGATE = 1 << 5        # surrogate pairs present

	# Get a UTF-8 continuation byte, give error if invalid, and update position and character value
	# Expands to code that advances `pos`, reads the next byte of `str` into `byt`,
	# raises UTF_ERR_CONT if it is not a continuation byte, and otherwise shifts its
	# low six bits into `ch`.  All four arguments are evaluated in the caller's scope.
	# (The bitwise OR was previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	macro get_continuation!(ch, byt, str, pos)
	    chvar = esc(ch)
	    bytvar = esc(byt)
	    strvar = esc(str)
	    posvar = esc(pos)
	    quote
	        $(bytvar) = $(strvar)[$(posvar) += 1]
	        is_valid_continuation($(bytvar)) || utf_errfunc(UTF_ERR_CONT, $(posvar), $(bytvar))
	        $(chvar) = ($(chvar) << 6) | ($(bytvar) & 0x3f)
	    end
	end

	# Check a non-ASCII character to find out what type it is
	# 0x80-0xff -> LATIN1
	# 0x100-0x7ff -> UNICODE2
	# 0x800-0xd7ff,0xe000-0xffff -> UNICODE3
	# 0xd800-0xdfff -> SURROGATE
	# 0x10000-0x10ffff -> UNICODE4

	# Result record returned by the check_string* validators in this file:
	# character counts grouped by UTF-8 encoded size, plus content flags.
	immutable UTF_String_Counts
	cntT::Int # total number of characters
	cnt2::Int # number of characters in the range 0x80:0x7ff (2 bytes in UTF-8)
	cnt3::Int # number of characters in the range 0x800:0xd7ff,0xe000:0xffff (3 bytes)
	cnt4::Int # number of characters in the range 0x10000:0x10ffff (4 bytes)
	flags::Int # bitwise OR of UTF_LONG, UTF_LATIN1, UTF_UNICODE2..4 and UTF_SURROGATE bits
	end

	#=
	@doc """
	@brief Validates and calculates number of characters in a string

	@param[in] str Vector of UInt8
	@param[in] options flags to determine error handling (default 0)

	@return (total characters, 2-byte, 3-byte, 4-byte, flags)
	@throws ArgumentError
	""" ->
	=#
	# Validate a UTF-8 byte vector and count its characters by encoded size.
	#
	#   str      bytes to validate; all sizeof(str) bytes are examined
	#   options  bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES, UTF_ACCEPT_LONG
	#
	# Returns a UTF_String_Counts.  Throws ArgumentError at the first invalid
	# sequence.  A surrogate pair written as two 3-byte sequences counts as one
	# character and is tallied in cnt4.
	# (Bitwise ORs were previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	function check_string_utf8(str::Vector{UInt8}, options::Integer=0)
	    local byt::UInt8
	    local ch::UInt32
	    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
	    pos = 0
	    len = sizeof(str)
	    @inbounds while pos < len
	        ch = str[pos += 1]
	        cntT += 1
	        if ch > 0x7f
	            # Check UTF-8 encoding
	            if ch < 0xc0
	                # 0x80-0xbf are continuation bytes and can never start a sequence;
	                # previously they were decoded as if they were 2-byte lead bytes
	                throw(ArgumentError("invalid UTF-8 sequence starting at index $pos (0x$(hex(ch)) is a continuation byte, not a lead byte)"))
	            elseif ch < 0xe0
	                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
	                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
	                ch &= 0x1f      # a 2-byte lead byte carries five payload bits
	                @get_continuation!(ch, byt, str, pos)
	                if ch > 0x7f
	                    cnt2 += 1
	                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
	                elseif (options & UTF_ACCEPT_LONG) != 0
	                    flags |= UTF_LONG
	                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
	                    # 0xc0 0x80 is accepted as '\0' unless UTF_NO_LONG_NULL is set
	                    flags |= UTF_LONG
	                else
	                    # pos is now on the second byte; the sequence started at pos-1
	                    utf_errfunc(UTF_ERR_LONG, pos-1, ch)
	                end
	            elseif ch < 0xf0
	                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
	                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
	                ch &= 0x0f
	                @get_continuation!(ch, byt, str, pos)
	                @get_continuation!(ch, byt, str, pos)
	                # check for surrogate pairs, make sure correct
	                if is_surrogate_char(ch)
	                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
	                    # next character must be a trailing surrogate character
	                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
	                    byt = str[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
	                    surr::UInt32 = 0xd  # payload of the 0xed lead byte just consumed
	                    @get_continuation!(surr, byt, str, pos)
	                    @get_continuation!(surr, byt, str, pos)
	                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
	                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
	                    flags |= UTF_SURROGATE
	                    cnt4 += 1
	                elseif ch > 0x07ff
	                    cnt3 += 1
	                elseif (options & UTF_ACCEPT_LONG) != 0
	                    flags |= UTF_LONG
	                    # count by decoded size, as the 2- and 4-byte branches do:
	                    # an overlong ASCII character is not a 2-byte character
	                    if ch > 0x7f
	                        cnt2 += 1
	                    end
	                else
	                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
	                end
	            elseif ch < 0xf5
	                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
	                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
	                ch &= 0x07
	                @get_continuation!(ch, byt, str, pos)
	                @get_continuation!(ch, byt, str, pos)
	                @get_continuation!(ch, byt, str, pos)
	                if ch > 0x10ffff
	                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
	                elseif ch > 0xffff
	                    cnt4 += 1
	                elseif is_surrogate_char(ch)
	                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
	                elseif (options & UTF_ACCEPT_LONG) != 0
	                    # This is an overly long encoded character
	                    flags |= UTF_LONG
	                    if ch > 0x7ff
	                        cnt3 += 1
	                    elseif ch > 0x7f
	                        cnt2 += 1
	                    end
	                else
	                    # the 4-byte sequence started at pos-3, as in the branches above
	                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
	                end
	            else
	                # lead bytes 0xf5-0xff are never valid
	                utf_errfunc(UTF_ERR_INVALID, pos, ch)
	            end
	        end
	    end
	    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
	end

	#=
	@doc """
	@brief Validates and calculates number of characters in a UTF-16 string

	@param[in] str Vector{UInt16}
	@param[in] len number of UInt16 code units of str to check

	@return (total characters, 2-byte, 3-byte, 4-byte, flags)
	@throws ArgumentError
	""" ->
	=#
	# Validate the first `len` code units of a UTF-16 vector and count characters.
	#
	#   str  UTF-16 code units
	#   len  number of code units to examine
	#
	# Returns a UTF_String_Counts; a lead/trail surrogate pair counts as one
	# character in cnt4.  Throws ArgumentError on an unpaired surrogate.
	# (Bitwise ORs were previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	function check_string_utf16(str::Vector{UInt16}, len::Int)
	    local unit::UInt32
	    local total::Int=0, two::Int=0, three::Int=0, four::Int=0, flags::Int=0
	    local pos::Int = 0
	    @inbounds while pos < len
	        pos += 1
	        unit = str[pos]
	        total += 1
	        unit > 0x7f || continue     # ASCII needs no further classification
	        if unit < 0x100
	            two += 1
	            flags |= UTF_LATIN1
	        elseif unit < 0x800
	            two += 1
	            flags |= UTF_UNICODE2
	        elseif !is_surrogate_char(unit)
	            three += 1
	        elseif !is_surrogate_lead(unit)
	            # a trailing surrogate that was not preceded by a leading one
	            utf_errfunc(UTF_ERR_NOT_LEAD, pos, unit)
	        else
	            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, unit)
	            # the following code unit has to be a trailing surrogate
	            pos += 1
	            unit = str[pos]
	            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, pos, unit)
	            four += 1
	        end
	    end
	    if three != 0
	        flags |= UTF_UNICODE3
	    end
	    if four != 0
	        flags |= UTF_UNICODE4
	    end
	    return UTF_String_Counts(total, two, three, four, flags)
	end

	#=
	@doc """
	@brief Validates and calculates number of characters in a UTF-32 string

	@param[in] str Vector{UInt32}
	@param[in] len number of elements of str to check
	@param[in] options flags to determine error handling (default 0)

	@return (total characters, 2-byte, 3-byte, 4-byte, flags)
	@throws ArgumentError
	""" ->
	=#
	# Validate the first `len` elements of a UTF-32 vector and count characters.
	#
	#   str      32-bit code units
	#   len      number of elements to examine
	#   options  only UTF_NO_SURROGATES is consulted here
	#
	# Returns a UTF_String_Counts.  Throws ArgumentError for values above 0x10ffff,
	# for unpaired surrogates, and for surrogate pairs when UTF_NO_SURROGATES is set.
	# (Bitwise ORs were previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	function check_string_utf32(str::Vector{UInt32}, len::Int, options::Integer=0)
	    local unit::UInt32
	    local total::Int=0, two::Int=0, three::Int=0, four::Int=0, flags::Int=0
	    local pos::Int = 0
	    @inbounds while pos < len
	        pos += 1
	        unit = str[pos]
	        total += 1
	        unit > 0x7f || continue     # ASCII needs no further classification
	        if unit < 0x100
	            two += 1
	            flags |= UTF_LATIN1
	        elseif unit < 0x800
	            two += 1
	            flags |= UTF_UNICODE2
	        elseif unit > 0xffff
	            unit > 0x10ffff && utf_errfunc(UTF_ERR_INVALID, pos, unit)
	            four += 1
	        elseif !is_surrogate_char(unit)
	            three += 1
	        elseif !is_surrogate_lead(unit)
	            # a trailing surrogate that was not preceded by a leading one
	            utf_errfunc(UTF_ERR_NOT_LEAD, pos, unit)
	        else
	            pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, unit)
	            # the following element has to be a trailing surrogate
	            pos += 1
	            unit = str[pos]
	            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, pos, unit)
	            four += 1
	            (options & UTF_NO_SURROGATES) == 0 || utf_errfunc(UTF_ERR_SURROGATE, pos, unit)
	            flags |= UTF_SURROGATE
	        end
	    end
	    if three != 0
	        flags |= UTF_UNICODE3
	    end
	    if four != 0
	        flags |= UTF_UNICODE4
	    end
	    return UTF_String_Counts(total, two, three, four, flags)
	end

	# Validate an AbstractString character by character and count characters.
	#
	#   str      string to walk with the start/next/done iteration protocol
	#   options  only UTF_NO_SURROGATES is consulted here
	#
	# Returns a UTF_String_Counts; throws ArgumentError on invalid content.
	#
	# The loop previously ran `while pos < endof(str)`, which stopped before the
	# last character was read; it now runs until `done`.  Error positions report
	# the index of the offending character rather than the index after it.
	# (Bitwise ORs were previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	function check_string_abs(str::AbstractString, options::Integer=0)
	    local ch::UInt32
	    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
	    local pos::Int = start(str)
	    local idx::Int = pos            # index of the character currently in `ch`
	    @inbounds while !done(str, pos)
	        idx = pos
	        ch, pos = next(str, pos)
	        cntT += 1
	        if ch > 0x7f
	            if ch < 0x100
	                cnt2 += 1
	                flags |= UTF_LATIN1
	            elseif ch < 0x800
	                cnt2 += 1
	                flags |= UTF_UNICODE2
	            elseif ch > 0xffff
	                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
	                cnt4 += 1
	            elseif !is_surrogate_char(ch)
	                cnt3 += 1
	            elseif is_surrogate_lead(ch)
	                # `pos` already points past the lead, so "missing" means no input is left
	                done(str, pos) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
	                # next character must be a trailing surrogate character
	                idx = pos
	                ch, pos = next(str, pos)
	                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
	                cnt4 += 1
	                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
	                flags |= UTF_SURROGATE
	            else
	                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
	            end
	        end
	    end
	    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
	end

	# Generic validator for UTF-16 / UTF-32 / Char vectors and AbstractStrings.
	#
	#   str      data to validate
	#   pos,len  for vectors: `pos` elements are already consumed (pass 0 to start at
	#            the first element) and elements up to index `len` are examined;
	#            for strings: `pos` is the index of the next character to read and
	#            `len` the index of the last one (e.g. start(str) and endof(str))
	#   options  only UTF_NO_SURROGATES is consulted, and not for Vector{UInt16}
	#
	# Returns a UTF_String_Counts; throws ArgumentError on invalid content.
	#
	# The string branches were previously guarded by `T == AbstractString`, which
	# is never true because T is the concrete type of `str`; strings therefore
	# took the vector indexing path.  The guard is now `T <: AbstractString`, and
	# the loop bound and missing-surrogate test follow the string position
	# convention.  Behaviour for vector arguments is unchanged.
	# (Bitwise ORs were previously written as backslash-escaped `\|`, which is
	# not valid Julia.)
	function check_string{T<:Union(Vector{UInt16},Vector{UInt32},Vector{Char},AbstractString)}(str::T, pos::Int, len::Int, options::Integer=0)
	    local ch::UInt32
	    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
	    local idx::Int = pos            # index of the element currently in `ch`
	    isstr = T <: AbstractString
	    @inbounds while (isstr ? pos <= len : pos < len)
	        if isstr
	            idx = pos
	            ch, pos = next(str, pos)
	        else
	            ch = str[pos += 1]
	            idx = pos
	        end
	        cntT += 1
	        if ch > 0x7f
	            if ch < 0x100
	                cnt2 += 1
	                flags |= UTF_LATIN1
	            elseif ch < 0x800
	                cnt2 += 1
	                flags |= UTF_UNICODE2
	            elseif T != Vector{UInt16} && ch > 0xffff
	                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
	                cnt4 += 1
	            elseif !is_surrogate_char(ch)
	                cnt3 += 1
	            elseif is_surrogate_lead(ch)
	                (isstr ? pos > len : pos == len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
	                # next character must be a trailing surrogate character
	                if isstr
	                    idx = pos
	                    ch, pos = next(str, pos)
	                else
	                    ch = str[pos += 1]
	                    idx = pos
	                end
	                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
	                cnt4 += 1
	                if T != Vector{UInt16}
	                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
	                end
	                flags |= UTF_SURROGATE
	            else
	                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
	            end
	        end
	    end
	    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
	end

	# Benchmark driver: run the generic check_string over the string's UTF-16 data
	# `max` times and return the result of the last run.
	function chkutf16(str::UTF16String, max::Int)
	    local result
	    units = str.data
	    for iter = 1:max
	        result = check_string(units, 0, sizeof(units)>>>1)  # byte size / 2 = code units
	    end
	    result
	end
	# Benchmark driver: run the generic check_string over the string's UTF-32 data
	# `max` times and return the result of the last run.
	function chkutf32(str::UTF32String, max::Int)
	    local result
	    units = str.data
	    for iter = 1:max
	        result = check_string(units, 0, sizeof(units)>>>2)  # byte size / 4 = code units
	    end
	    result
	end
	# Benchmark driver: run check_string_utf8 over the string's bytes `max` times
	# and return the result of the last run.
	function chkstr8(str::UTF8String, max::Int)
	    local result
	    bytes = str.data
	    for iter = 1:max
	        result = check_string_utf8(bytes)
	    end
	    result
	end
	# Benchmark driver: run check_string_utf16 over the string's UTF-16 data `max`
	# times and return the result of the last run.
	function chkstr16(str::UTF16String, max::Int)
	    local result
	    units = str.data
	    for iter = 1:max
	        result = check_string_utf16(units, sizeof(units)>>>1)   # byte size / 2 = code units
	    end
	    result
	end
	# Benchmark driver: run check_string_utf32 over the string's data, viewed as
	# UInt32 values, `max` times and return the result of the last run.
	function chkstr32(str::UTF32String, max::Int)
	    local result
	    units = reinterpret(UInt32, str.data)
	    for iter = 1:max
	        result = check_string_utf32(units, sizeof(units)>>>2)   # byte size / 4 = code units
	    end
	    result
	end

	# Time every validator on the same text in its three encodings.
	# Each chk* driver repeats its check n times; @time prints the cost after the label.
	function tstchk(n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
	# encoding-specific validators
	print("Check UTF-8 ")
	@time chkstr8(strUTF8,n)
	print("Check UTF-16 ")
	@time chkstr16(strUTF16,n)
	print("Check UTF-32 ")
	@time chkstr32(strUTF32,n)
	# generic check_string on the same UTF-16 / UTF-32 data
	print("Generic UTF-16 ")
	@time chkutf16(strUTF16,n)
	print("Generic UTF-32 ")
	@time chkutf32(strUTF32,n)
	end

	# Print a header for one test case (label, repeat count, character length and
	# byte size of each encoding), then run the timings.
	#
	#   str   label printed in the header
	#   n     number of repetitions passed on to tstchk
	#
	# The label is typed AbstractString, the name this file's version shim makes
	# available on every supported Julia version, instead of the older `String`.
	function tstall(str::AbstractString, n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
	    println("\n\n$str: Looping $n times, length=$(length(strUTF32))")
	    println("UTF-8: $(sizeof(strUTF8)), UTF-16: $(sizeof(strUTF16)), UTF-32: $(sizeof(strUTF32))\n")
	    tstchk(n, strUTF8, strUTF16, strUTF32)
	end

	# Convert the ASCII and surrogate test strings to UTF-16 and UTF-32, then run
	# the timing suite on both.  Only strA_UTF8 and strS_UTF8 are used; the other
	# string arguments are accepted but not read.
	function tstsiz(n,strAscii,strA_UTF8,strL_UTF8,str2_UTF8,str3_UTF8,str4_UTF8,strS_UTF8)
	    # all conversions happen before any timing output is printed
	    ascii16 = utf16(strA_UTF8)
	    surr16  = utf16(strS_UTF8)
	    ascii32 = utf32(strA_UTF8)
	    surr32  = utf32(strS_UTF8)
	    tstall("ASCII:", n, strA_UTF8, ascii16, ascii32)
	    tstall("Surrogates:", n, strS_UTF8, surr16, surr32)
	end

	export dotest
	# Build the benchmark inputs and run the timing suite with n repetitions.
	# Every 16-character base string is repeated 262144 times; the raw byte
	# sequence is quadrupled nine times, which is the same factor (4^9).
	function dotest(n)
	    reps = 262144
	    # Create some ASCII, UTF8, UTF16, and UTF32 strings
	    baseascii = "abcdefghijklmnop\uff"
	    binstr    = b"abcdefghijk\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80\xed\xaf\xbf\xed\xbf\xbf"
	    strAscii  = "abcdefghijklmnop" ^ reps
	    strA_UTF8 = baseascii[1:16] ^ reps
	    strL_UTF8 = "abcdefghijk\uff\uff\uff\uff\uff" ^ reps
	    str2_UTF8 = "abcdefghijk\uff\uff\uff\u7ff\u7ff" ^ reps
	    str3_UTF8 = "abcdefghijk\uff\uff\uff\u7fff\u7fff" ^ reps
	    str4_UTF8 = "abcdefghijk\uff\u7ff\u7fff\U7ffff\U0fffff" ^ reps

	    for round = 1:9
	        binstr = vcat(binstr, binstr, binstr, binstr)
	    end
	    tstsiz(n, strAscii, strA_UTF8, strL_UTF8, str2_UTF8, str3_UTF8, str4_UTF8, UTF8String(binstr))
	end
	end