Skip to content

Instantly share code, notes, and snippets.

@quinnj
Created April 17, 2019 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quinnj/712821c3718ac434af466d5775a238c6 to your computer and use it in GitHub Desktop.
Save quinnj/712821c3718ac434af466d5775a238c6 to your computer and use it in GitHub Desktop.
Argument type perf difference
module Foo

"""
    memcmp(a, b, len) -> Bool

Return `true` when the `len` bytes starting at `a` and at `b` are identical, as decided
by the C library `memcmp`. The caller must keep both memory regions alive and readable.
"""
function memcmp(a::Ptr{UInt8}, b::Ptr{UInt8}, len::Integer)
    return ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, len) == 0
end

"""
    delimlen(delim)

Number of bytes taken by one occurrence of `delim`: `1` for a single-byte delimiter,
or the stored length for a `(pointer, length)` multi-byte delimiter.
"""
delimlen(delim::UInt8) = 1
delimlen(delim::Tuple{Ptr{UInt8}, Int}) = delim[2]

"""
    checkdelim(delim, b, buf, pos, eof) -> Bool

Report whether a delimiter starts at position `pos` of `buf`.

For a `UInt8` delimiter only the already-loaded byte `b` is compared. For a
`(pointer, length)` delimiter the bytes `buf[pos:pos+length-1]` are compared with the
delimiter bytes, and `false` is returned when fewer than `length` bytes remain before
`eof`.
"""
checkdelim(delim::UInt8, b, buf, pos, eof) = delim == b
@inline function checkdelim(delim::Tuple{Ptr{UInt8}, Int}, b, buf, pos, eof)
    ptr, len = delim
    # not enough bytes left for a full delimiter
    pos + len <= eof || return false
    # keep `buf` rooted while its raw pointer is handed to C
    return GC.@preserve buf memcmp(pointer(buf, pos), ptr, len)
end

"""
    overflowval(T)

Largest accumulator value for which `10 * x + 9` still fits in `T`; above it the parser
switches to overflow-checked arithmetic.
"""
overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))

# Result of a parse: a bit set stored in an Int16. The sign bit marks an invalid result,
# so every valid code is non-negative and every invalid code is negative.
const ReturnCode = Int16

"""
    ok(code) -> Bool

`true` when `code` is strictly positive, i.e. the sign (invalid) bit is clear and at
least one flag is set.
"""
ok(x::ReturnCode) = x > 0

const SUCCESS = 0b0000000000000000 % ReturnCode
const INVALID = 0b1000000000000000 % ReturnCode
# success flags
const OK = 0b0000000000000001 % ReturnCode
const SENTINEL = 0b0000000000000010 % ReturnCode
# property flags
const QUOTED = 0b0000000000000100 % ReturnCode
const DELIMITED = 0b0000000000001000 % ReturnCode
const NEWLINE = 0b0000000000010000 % ReturnCode
const EOF = 0b0000000000100000 % ReturnCode
const ESCAPED_STRING = 0b0000001000000000 % ReturnCode
# invalid flags (each one also carries the INVALID sign bit)
const INVALID_QUOTED_FIELD = 0b1000000001000000 % ReturnCode
const INVALID_DELIMITER = 0b1000000010000000 % ReturnCode
const OVERFLOW = 0b1000000100000000 % ReturnCode

"""
    defaultparser(T, buf, pos, eof, sentinel, wh1, wh2, oq, cq, e, ignorerepeated, delim)

Parse one integer field of type `T` from the bytes of `buf`, starting at index `pos`.

`eof` is the first index that must not be read: parsing stops as soon as `pos == eof`.

# Arguments
- `sentinel`: `nothing` (no sentinel handling), `missing` (a zero-width field counts as a
  sentinel), or an iterable of `(pointer, length)` sentinel strings, sorted from longest
  to shortest.
- `wh1`, `wh2`: the two bytes skipped as whitespace around the value.
- `oq`, `cq`, `e`: open-quote, close-quote and escape bytes for quoted fields.
- `ignorerepeated`: when `true`, a run of consecutive delimiters is consumed as one.
- `delim`: a single delimiter byte or a `(pointer, length)` multi-byte delimiter.

# Returns
A tuple `(x, code, n)`: the parsed value (`zero(T)` when no digits were read), the
`ReturnCode` bit set describing the field, and the number of positions advanced from the
starting `pos`. With `ignorerepeated == false` the position stops on the delimiter or
newline byte itself; with `ignorerepeated == true` the delimiter run is included in `n`.
"""
@inline function defaultparser(
    ::Type{T}, buf, pos, eof, sentinel, wh1::UInt8, wh2::UInt8, oq::UInt8, cq::UInt8,
    e::UInt8, ignorerepeated::Bool, delim::Union{UInt8, Tuple{Ptr{UInt8}, Int}},
) where {T <: Integer}
    startpos = pos
    code = SUCCESS
    x = zero(T)
    neg = false
    quoted = false
    sentinelpos = 0
    # true when `sentinel` is a collection of explicit sentinel strings
    hassentinels = sentinel !== nothing && sentinel !== missing
    if pos >= eof
        code = (sentinel === missing ? SENTINEL : INVALID) | EOF
        @goto donedone
    end
    @inbounds b = buf[pos]
    # strip leading whitespace
    while b == wh1 || b == wh2
        pos += 1
        if pos == eof
            code = INVALID | EOF
            @goto donedone
        end
        @inbounds b = buf[pos]
    end
    # check for start of quoted field
    quoted = b == oq
    if quoted
        code = QUOTED
        pos += 1
        if pos == eof
            # input ended right after the opening quote
            code |= INVALID_QUOTED_FIELD | EOF
            @goto donedone
        end
        @inbounds b = buf[pos]
        # ignore whitespace within quoted field
        while b == wh1 || b == wh2
            pos += 1
            if pos == eof
                code |= INVALID_QUOTED_FIELD | EOF
                @goto donedone
            end
            @inbounds b = buf[pos]
        end
    end
    # check for sentinel values if applicable; the first (longest) match wins
    if hassentinels
        for (ptr, len) in sentinel
            if pos + len <= eof
                hit = GC.@preserve buf memcmp(pointer(buf, pos), ptr, len)
                if hit
                    sentinelpos = pos + len
                    break
                end
            end
        end
    end
    # start actual int parsing
    neg = b == UInt8('-')
    pos += neg || b == UInt8('+')
    if pos == eof
        # eof after stripping whitespace & maybe parsing '-' or '+'
        if sentinel === missing && pos == startpos
            # if we haven't moved any chars, then we count it as a zero-width sentinel match
            code |= SENTINEL | EOF
        elseif hassentinels && sentinelpos > 0
            # if we matched a sentinel value
            pos = sentinelpos
            code |= SENTINEL | EOF
        else
            # otherwise, it's just an invalid value
            code |= INVALID | EOF
        end
        if quoted
            # eof in the middle of a quoted field
            code |= INVALID_QUOTED_FIELD
        end
        @goto donedone
    end
    # from here on `b` temporarily holds the digit value (byte minus '0'); any non-digit
    # byte wraps around to something larger than 9
    @inbounds b = buf[pos] - UInt8('0')
    if b > 0x09
        # character isn't a digit, check for sentinels, otherwise INVALID value
        if sentinel === missing && pos == startpos
            code |= SENTINEL
        elseif hassentinels && sentinelpos > 0
            pos = sentinelpos
            code |= SENTINEL
        else
            code |= INVALID
        end
        @goto donevalue
    end
    maxsafe = overflowval(T)
    while true
        # convert the digit to T so `x` keeps type T (Int8 + UInt8 would widen to Int16)
        x = T(10) * x + T(b)
        pos += 1
        if pos == eof
            x = ifelse(neg, -x, x)
            # a sentinel that reaches at least as far as the digits takes precedence
            if hassentinels && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL | EOF
            else
                code |= OK | EOF
            end
            if quoted
                # eof before the closing quote
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos] - UInt8('0')
        if b > 0x09
            # detected a non-digit, time to bail on value parsing
            if hassentinels && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OK
            end
            x = ifelse(neg, -x, x)
            @goto donevalue
        end
        x > maxsafe && break
    end
    # extra loop because we got too close to overflowing while parsing digits
    while true
        x, ov_mul = Base.mul_with_overflow(x, T(10))
        x, ov_add = Base.add_with_overflow(x, T(b))
        if ov_mul | ov_add
            # we overflowed, check for valid sentinel, otherwise mark as OVERFLOW
            if hassentinels && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OVERFLOW
            end
            @goto donevalue
        end
        pos += 1
        if pos == eof
            if hassentinels && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL | EOF
            else
                code |= OK | EOF
            end
            x = ifelse(neg, -x, x)
            if quoted
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos] - UInt8('0')
        if b > 0x09
            if hassentinels && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OK
            end
            x = ifelse(neg, -x, x)
            @goto donevalue
        end
    end
    @label donevalue
    # donevalue means we finished parsing a value or sentinel. `pos` can only equal eof
    # here when a sentinel match consumed the rest of the input.
    if pos >= eof
        code |= EOF
        if quoted
            code |= INVALID_QUOTED_FIELD
        end
        @goto donedone
    end
    # reload the raw byte: `b` held a digit value, or is stale after a sentinel jump
    @inbounds b = buf[pos]
    # strip trailing whitespace
    while b == wh1 || b == wh2
        pos += 1
        if pos == eof
            code |= EOF
            if quoted
                # ran out of input before the closing quote
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos]
    end
    # for quoted fields, find the closing quote character; if it is not the very next
    # byte, something unexpected sits inside the quotes, so mark INVALID
    if quoted
        same = cq == e
        isfirst = true
        while true
            pos += 1
            if same && b == e
                # escape and close-quote are the same byte: a doubled byte is an escaped
                # quote, a single one closes the field
                if pos == eof
                    code |= EOF
                    if !isfirst
                        code |= INVALID
                    end
                    @goto donedone
                elseif buf[pos] != cq
                    if !isfirst
                        code |= INVALID
                    end
                    break
                end
                pos += 1
            elseif b == e
                # escape byte: skip the escaped byte as well
                if pos == eof
                    code |= INVALID_QUOTED_FIELD | EOF
                    @goto donedone
                end
                pos += 1
            elseif b == cq
                if !isfirst
                    code |= INVALID
                end
                if pos == eof
                    # closing quote was the last byte of the input
                    code |= EOF
                    @goto donedone
                end
                break
            end
            if pos == eof
                code |= INVALID_QUOTED_FIELD | EOF
                @goto donedone
            end
            isfirst = false
            @inbounds b = buf[pos]
        end
        @inbounds b = buf[pos]
        # ignore whitespace after quoted field
        while b == wh1 || b == wh2
            pos += 1
            if pos == eof
                code |= EOF
                @goto donedone
            end
            @inbounds b = buf[pos]
        end
    end
    # Look for the delimiter or a newline at the current byte. If neither is there the
    # field has trailing garbage: keep advancing one byte at a time until a delimiter,
    # newline or eof shows up, and flag the result with INVALID_DELIMITER.
    skipped = SUCCESS
    while true
        if ignorerepeated
            matched = false
            while checkdelim(delim, b, buf, pos, eof)
                matched = true
                pos += delimlen(delim)
                if pos >= eof
                    code |= EOF
                    break
                end
                @inbounds b = buf[pos]
            end
            if matched
                code |= DELIMITED | skipped
                @goto donedone
            end
        elseif checkdelim(delim, b, buf, pos, eof)
            # found the delimiter we were looking for
            code |= DELIMITED | skipped
            @goto donedone
        end
        # didn't find delimiter, but let's check for a newline character
        if b == UInt8('\n')
            code |= NEWLINE | skipped | ifelse(pos + 1 == eof, EOF, SUCCESS)
            @goto donedone
        elseif b == UInt8('\r')
            # treat "\r\n" as one newline, but never peek past the end of the input
            if pos + 1 < eof && buf[pos + 1] == UInt8('\n')
                pos += 1
            end
            code |= NEWLINE | skipped | ifelse(pos + 1 == eof, EOF, SUCCESS)
            @goto donedone
        end
        pos += 1
        if pos == eof
            code |= EOF | INVALID_DELIMITER
            @goto donedone
        end
        @inbounds b = buf[pos]
        skipped = INVALID_DELIMITER
    end
    @label donedone
    return x, code, pos - startpos
end

end # module
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment