nalimilan/stringindex.jl

## stringindex.jl
import Base.<, Base.next, Base.nextind, Base.prevind, Base.colon, Base.length, Base.getindex, Base.first, Base.last,
       Base.is_utf8_start, Base.utf8_trailing, Base.utf8_offset

# Simple index with string plus byte offset
immutable StringIndex{S<:AbstractString}
    str::S
    i::Int
end

<(a::StringIndex, b::StringIndex) = a.i < b.i

function next(s::UTF8String, idx::StringIndex{UTF8String})
    # potentially faster version
    # d = s.data
    # a::UInt32 = d[i]
    # if a < 0x80; return char(a); end
    # #if a&0xc0==0x80; return '\ufffd'; end
    # b::UInt32 = a<<6 + d[i+1]
    # if a < 0xe0; return char(b - 0x00003080); end
    # c::UInt32 = b<<6 + d[i+2]
    # if a < 0xf0; return char(c - 0x000e2080); end
    # return char(c<<6 + d[i+3] - 0x03c82080)

    i = idx.i
    d = s.data
    b = d[i]
    if !is_utf8_start(b)
        j = i-1
        while 0 < j && !is_utf8_start(d[j])
            j -= 1
        end
        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
            # b is a continuation byte of a valid UTF-8 character
            error("invalid UTF-8 character index")
        end
        # move past 1 byte in case the data is actually Latin-1
        return '\ufffd', i+1
    end
    trailing = utf8_trailing[b+1]
    if length(d) < i + trailing
        return '\ufffd', i+1
    end
    c::UInt32 = 0
    for j = 1:trailing+1
        c <<= 6
        c += d[i]
        i += 1
    end
    c -= utf8_offset[trailing+1]
    char(c), i
end

getindex{S<:AbstractString}(s::S, i::StringIndex{S}) = next(s,i)[1]

nextind(i::StringIndex) = StringIndex(i.str, nextind(i.str, i.i))
prevind(i::StringIndex) = StringIndex(i.str, prevind(i.str, i.i))

function +(si::StringIndex, j::Integer)
    j < 0 && return si - (-j)
    i = si.i
    while j > 0
        i = nextind(si.str, i)
        j -= 1
    end
    return StringIndex(si.str, i)
end

function -(si::StringIndex, j::Integer)
    j < 0 && return si + (-j)
    i = si.i
    while j > 0
        i = prevind(si.str, i)
        j -= 1
    end
    return StringIndex(si.str, i)
end


# Variant with byte index + character offset
immutable StringIndexOffset{S<:AbstractString}
    i::Int
    offset::Int
end

# StringIndexOffset{S<:AbstractString}(i::Int) = StringIndexOffset{S}(i, 0)

function <(a::StringIndexOffset, b::StringIndexOffset)
    a.i == b.i || throw(ArgumentError())
    return a.offset < b.offset
end

@inline function byte_index(s::UTF8String, idx::StringIndexOffset{UTF8String})
    i = idx.i
    idx.offset == 0 && return i

    if idx.offset > 0
        for j in 1:idx.offset
            i = nextind(s, i)
        end
    elseif idx.offset < 0
        for j in idx.offset:1
            i = prevind(s, i)
        end
    end
    return i
end

function next(s::UTF8String, idx::StringIndexOffset{UTF8String})
    # potentially faster version
    # d = s.data
    # a::UInt32 = d[i]
    # if a < 0x80; return char(a); end
    # #if a&0xc0==0x80; return '\ufffd'; end
    # b::UInt32 = a<<6 + d[i+1]
    # if a < 0xe0; return char(b - 0x00003080); end
    # c::UInt32 = b<<6 + d[i+2]
    # if a < 0xf0; return char(c - 0x000e2080); end
    # return char(c<<6 + d[i+3] - 0x03c82080)

    i = byte_index(s, idx)
    d = s.data
    b = d[i]
    if !is_utf8_start(b)
        j = i-1
        while 0 < j && !is_utf8_start(d[j])
            j -= 1
        end
        if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
            # b is a continuation byte of a valid UTF-8 character
            error("invalid UTF-8 character index")
        end
        # move past 1 byte in case the data is actually Latin-1
        return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
    end
    trailing = utf8_trailing[b+1]
    if length(d) < i + trailing
        return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
    end
    c::UInt32 = 0
    for j = 1:trailing+1
        c <<= 6
        c += d[i]
        i += 1
    end
    c -= utf8_offset[trailing+1]
    char(c), StringIndexOffset{UTF8String}(i, 0)
end

getindex{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = next(s,i)[1]

nextind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)
prevind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)

nextind{S<:AbstractString}(i::StringIndexOffset{S}) = i + 1
prevind{S<:AbstractString}(i::StringIndexOffset{S}) = i - 1

+{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset + j)
-{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset - j)


# More efficient version using pointers
immutable StringIndexPointer{S<:AbstractString}
    ptr::Ptr{Uint8}
    # Required to check lower bound
    start::Ptr{Uint8}
end

<(a::StringIndexPointer, b::StringIndexPointer) = a.ptr < b.ptr

isvalid(i::StringIndexPointer) = (i.start <= i.ptr && is_utf8_start(unsafe_load(i.ptr)))

function nextind{S<:AbstractString}(i::StringIndexPointer{S})
    p = i.ptr + 1
    while unsafe_load(p) != 0
        idx = StringIndexPointer{UTF8String}(p, i.start)
        if isvalid(idx)
            return idx
        end
        p += 1
    end
    throw(BoundsError())
end

function prevind{S<:AbstractString}(i::StringIndexPointer{S})
    p = i.ptr
    while p >= i.ptr
        idx = StringIndexPointer{UTF8String}(p, i.start)
        if isvalid(idx)
            return idx
        end
        p -= 1
    end
    throw(BoundsError())
end

function +(idx::StringIndexPointer{UTF8String}, j::Integer)
    j < 0 && return idx - (-j)
    while j > 0
        idx = nextind(idx)
        j -= 1
    end
    return idx
end

function -(idx::StringIndexPointer{UTF8String}, j::Integer)
    j < 0 && return idx + (-j)
    while j > 0
        idx = prevind(idx)
        j -= 1
    end
    return idx
end

function next(s::UTF8String, idx::StringIndexPointer{UTF8String})
    # potentially faster version
    # d = s.data
    # a::UInt32 = d[i]
    # if a < 0x80; return char(a); end
    # #if a&0xc0==0x80; return '\ufffd'; end
    # b::UInt32 = a<<6 + d[i+1]
    # if a < 0xe0; return char(b - 0x00003080); end
    # c::UInt32 = b<<6 + d[i+2]
    # if a < 0xf0; return char(c - 0x000e2080); end
    # return char(c<<6 + d[i+3] - 0x03c82080)

    startptr = pointer(s)
    endptr = startptr + endof(s)

# Not checked in current implementation either
#    startptr <= idx.ptr <= endptr || throw(BoundsError())

    i = idx.ptr
    b = unsafe_load(i, 1)
    if !is_utf8_start(b)
        j = i - 1
        while startptr < j && !is_utf8_start(unsafe_load(j, 1))
            j -= 1
        end
        if startptr < j && i <= j + utf8_trailing[unsafe_load(j, 1) + 1] - 1 <= endptr
            # b is a continuation byte of a valid UTF-8 character
            error("invalid UTF-8 character index")
        end
        # move past 1 byte in case the data is actually Latin-1
        return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
    end
    trailing = utf8_trailing[b+1]
    if endptr < i + trailing - 1
        return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
    end
    c::UInt32 = 0
    for k = 1:trailing+1
        c <<= 6
        c += unsafe_load(i, 1)
        i += 1
    end
    c -= utf8_offset[trailing+1]
    char(c), StringIndexPointer{UTF8String}(i, startptr)
end

getindex{S<:AbstractString}(s::S, i::StringIndexPointer{S}) = next(s,i)[1]


x = "café crème"
i = StringIndexPointer{UTF8String}(pointer(x, 4), pointer(x))
j = StringIndex(x, 4)
k = StringIndexOffset{UTF8String}(4, 0)


f_ind(x, i) = for l in 1:100000 x[i] end
f_nextind(x, i) = for l in 1:100000 x[nextind(i)] end
f_plusone(x, i) = for l in 1:100000 x[i+1] end
g(x, i) = for l in 1:100000 x[nextind(x, i)] end

f_ind(x, i); f_nextind(x, i); f_plusone(x, i);
f_ind(x, j); f_nextind(x, j); f_plusone(x, j);
f_ind(x, k); f_nextind(x, k); f_plusone(x, k);
f_ind(x, 4); g(x, k); g(x, 4);

@time for l in 1:100 f_ind(x, i) end
@time for l in 1:100 f_nextind(x, i) end
@time for l in 1:100 f_plusone(x, i) end
@time for l in 1:100 f_ind(x, j) end
@time for l in 1:100 f_nextind(x, j) end
@time for l in 1:100 f_plusone(x, j) end
@time for l in 1:100 f_ind(x, k) end
@time for l in 1:100 f_nextind(x, k) end
@time for l in 1:100 f_plusone(x, k) end
@time for l in 1:100 g(x, k) end
@time for l in 1:100 f_ind(x, 4) end
@time for l in 1:100 g(x, 4) end


# julia> @time for l in 1:100 f_ind(x, i) end
# elapsed time: 0.105896672 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_nextind(x, i) end
# elapsed time: 0.199674082 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_plusone(x, i) end
# elapsed time: 0.238348901 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_ind(x, j) end
# elapsed time: 0.098285909 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_nextind(x, j) end
# elapsed time: 0.621622011 seconds (240000000 bytes allocated, 38.06% gc time)

# julia> @time for l in 1:100 f_plusone(x, j) end
# elapsed time: 0.655069184 seconds (240000000 bytes allocated, 34.15% gc time)

# julia> @time for l in 1:100 f_ind(x, k) end
# elapsed time: 0.109709424 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_nextind(x, k) end
# elapsed time: 0.28053811 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_plusone(x, k) end
# elapsed time: 0.279604258 seconds (0 bytes allocated)

# julia> @time for l in 1:100 g(x, k) end
# elapsed time: 0.317775374 seconds (0 bytes allocated)

# julia> @time for l in 1:100 f_ind(x, 4) end
# elapsed time: 0.089790495 seconds (0 bytes allocated)

# julia> @time for l in 1:100 g(x, 4) end
# elapsed time: 0.24096117 seconds (0 bytes allocated)
	import Base.<, Base.next, Base.nextind, Base.prevind, Base.colon, Base.length, Base.getindex, Base.first, Base.last,
	Base.is_utf8_start, Base.utf8_trailing, Base.utf8_offset

	# Simple index with string plus byte offset
	immutable StringIndex{S<:AbstractString}
	str::S
	i::Int
	end

	<(a::StringIndex, b::StringIndex) = a.i < b.i

	function next(s::UTF8String, idx::StringIndex{UTF8String})
	# potentially faster version
	# d = s.data
	# a::UInt32 = d[i]
	# if a < 0x80; return char(a); end
	# #if a&0xc0==0x80; return '\ufffd'; end
	# b::UInt32 = a<<6 + d[i+1]
	# if a < 0xe0; return char(b - 0x00003080); end
	# c::UInt32 = b<<6 + d[i+2]
	# if a < 0xf0; return char(c - 0x000e2080); end
	# return char(c<<6 + d[i+3] - 0x03c82080)

	i = idx.i
	d = s.data
	b = d[i]
	if !is_utf8_start(b)
	j = i-1
	while 0 < j && !is_utf8_start(d[j])
	j -= 1
	end
	if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
	# b is a continuation byte of a valid UTF-8 character
	error("invalid UTF-8 character index")
	end
	# move past 1 byte in case the data is actually Latin-1
	return '\ufffd', i+1
	end
	trailing = utf8_trailing[b+1]
	if length(d) < i + trailing
	return '\ufffd', i+1
	end
	c::UInt32 = 0
	for j = 1:trailing+1
	c <<= 6
	c += d[i]
	i += 1
	end
	c -= utf8_offset[trailing+1]
	char(c), i
	end

	getindex{S<:AbstractString}(s::S, i::StringIndex{S}) = next(s,i)[1]

	nextind(i::StringIndex) = StringIndex(i.str, nextind(i.str, i.i))
	prevind(i::StringIndex) = StringIndex(i.str, prevind(i.str, i.i))

	function +(si::StringIndex, j::Integer)
	j < 0 && return si - (-j)
	i = si.i
	while j > 0
	i = nextind(si.str, i)
	j -= 1
	end
	return StringIndex(si.str, i)
	end

	function -(si::StringIndex, j::Integer)
	j < 0 && return si + (-j)
	i = si.i
	while j > 0
	i = prevind(si.str, i)
	j -= 1
	end
	return StringIndex(si.str, i)
	end


	# Variant with byte index + character offset
	immutable StringIndexOffset{S<:AbstractString}
	i::Int
	offset::Int
	end

	# StringIndexOffset{S<:AbstractString}(i::Int) = StringIndexOffset{S}(i, 0)

	function <(a::StringIndexOffset, b::StringIndexOffset)
	a.i == b.i \|\| throw(ArgumentError())
	return a.offset < b.offset
	end

	@inline function byte_index(s::UTF8String, idx::StringIndexOffset{UTF8String})
	i = idx.i
	idx.offset == 0 && return i

	if idx.offset > 0
	for j in 1:idx.offset
	i = nextind(s, i)
	end
	elseif idx.offset < 0
	for j in idx.offset:1
	i = prevind(s, i)
	end
	end
	return i
	end

	function next(s::UTF8String, idx::StringIndexOffset{UTF8String})
	# potentially faster version
	# d = s.data
	# a::UInt32 = d[i]
	# if a < 0x80; return char(a); end
	# #if a&0xc0==0x80; return '\ufffd'; end
	# b::UInt32 = a<<6 + d[i+1]
	# if a < 0xe0; return char(b - 0x00003080); end
	# c::UInt32 = b<<6 + d[i+2]
	# if a < 0xf0; return char(c - 0x000e2080); end
	# return char(c<<6 + d[i+3] - 0x03c82080)

	i = byte_index(s, idx)
	d = s.data
	b = d[i]
	if !is_utf8_start(b)
	j = i-1
	while 0 < j && !is_utf8_start(d[j])
	j -= 1
	end
	if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
	# b is a continuation byte of a valid UTF-8 character
	error("invalid UTF-8 character index")
	end
	# move past 1 byte in case the data is actually Latin-1
	return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
	end
	trailing = utf8_trailing[b+1]
	if length(d) < i + trailing
	return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
	end
	c::UInt32 = 0
	for j = 1:trailing+1
	c <<= 6
	c += d[i]
	i += 1
	end
	c -= utf8_offset[trailing+1]
	char(c), StringIndexOffset{UTF8String}(i, 0)
	end

	getindex{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = next(s,i)[1]

	nextind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)
	prevind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)

	nextind{S<:AbstractString}(i::StringIndexOffset{S}) = i + 1
	prevind{S<:AbstractString}(i::StringIndexOffset{S}) = i - 1

	+{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset + j)
	-{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset - j)


	# More efficient version using pointers
	immutable StringIndexPointer{S<:AbstractString}
	ptr::Ptr{Uint8}
	# Required to check lower bound
	start::Ptr{Uint8}
	end

	<(a::StringIndexPointer, b::StringIndexPointer) = a.ptr < b.ptr

	isvalid(i::StringIndexPointer) = (i.start <= i.ptr && is_utf8_start(unsafe_load(i.ptr)))

	function nextind{S<:AbstractString}(i::StringIndexPointer{S})
	p = i.ptr + 1
	while unsafe_load(p) != 0
	idx = StringIndexPointer{UTF8String}(p, i.start)
	if isvalid(idx)
	return idx
	end
	p += 1
	end
	throw(BoundsError())
	end

	function prevind{S<:AbstractString}(i::StringIndexPointer{S})
	p = i.ptr
	while p >= i.ptr
	idx = StringIndexPointer{UTF8String}(p, i.start)
	if isvalid(idx)
	return idx
	end
	p -= 1
	end
	throw(BoundsError())
	end

	function +(idx::StringIndexPointer{UTF8String}, j::Integer)
	j < 0 && return idx - (-j)
	while j > 0
	idx = nextind(idx)
	j -= 1
	end
	return idx
	end

	function -(idx::StringIndexPointer{UTF8String}, j::Integer)
	j < 0 && return idx + (-j)
	while j > 0
	idx = prevind(idx)
	j -= 1
	end
	return idx
	end

	function next(s::UTF8String, idx::StringIndexPointer{UTF8String})
	# potentially faster version
	# d = s.data
	# a::UInt32 = d[i]
	# if a < 0x80; return char(a); end
	# #if a&0xc0==0x80; return '\ufffd'; end
	# b::UInt32 = a<<6 + d[i+1]
	# if a < 0xe0; return char(b - 0x00003080); end
	# c::UInt32 = b<<6 + d[i+2]
	# if a < 0xf0; return char(c - 0x000e2080); end
	# return char(c<<6 + d[i+3] - 0x03c82080)

	startptr = pointer(s)
	endptr = startptr + endof(s)

	# Not checked in current implementation either
	# startptr <= idx.ptr <= endptr \|\| throw(BoundsError())

	i = idx.ptr
	b = unsafe_load(i, 1)
	if !is_utf8_start(b)
	j = i - 1
	while startptr < j && !is_utf8_start(unsafe_load(j, 1))
	j -= 1
	end
	if startptr < j && i <= j + utf8_trailing[unsafe_load(j, 1) + 1] - 1 <= endptr
	# b is a continuation byte of a valid UTF-8 character
	error("invalid UTF-8 character index")
	end
	# move past 1 byte in case the data is actually Latin-1
	return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
	end
	trailing = utf8_trailing[b+1]
	if endptr < i + trailing - 1
	return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
	end
	c::UInt32 = 0
	for k = 1:trailing+1
	c <<= 6
	c += unsafe_load(i, 1)
	i += 1
	end
	c -= utf8_offset[trailing+1]
	char(c), StringIndexPointer{UTF8String}(i, startptr)
	end

	getindex{S<:AbstractString}(s::S, i::StringIndexPointer{S}) = next(s,i)[1]



	x = "café crème"
	i = StringIndexPointer{UTF8String}(pointer(x, 4), pointer(x))
	j = StringIndex(x, 4)
	k = StringIndexOffset{UTF8String}(4, 0)


	f_ind(x, i) = for l in 1:100000 x[i] end
	f_nextind(x, i) = for l in 1:100000 x[nextind(i)] end
	f_plusone(x, i) = for l in 1:100000 x[i+1] end
	g(x, i) = for l in 1:100000 x[nextind(x, i)] end

	f_ind(x, i); f_nextind(x, i); f_plusone(x, i);
	f_ind(x, j); f_nextind(x, j); f_plusone(x, j);
	f_ind(x, k); f_nextind(x, k); f_plusone(x, k);
	f_ind(x, 4); g(x, k); g(x, 4);

	@time for l in 1:100 f_ind(x, i) end
	@time for l in 1:100 f_nextind(x, i) end
	@time for l in 1:100 f_plusone(x, i) end
	@time for l in 1:100 f_ind(x, j) end
	@time for l in 1:100 f_nextind(x, j) end
	@time for l in 1:100 f_plusone(x, j) end
	@time for l in 1:100 f_ind(x, k) end
	@time for l in 1:100 f_nextind(x, k) end
	@time for l in 1:100 f_plusone(x, k) end
	@time for l in 1:100 g(x, k) end
	@time for l in 1:100 f_ind(x, 4) end
	@time for l in 1:100 g(x, 4) end


	# julia> @time for l in 1:100 f_ind(x, i) end
	# elapsed time: 0.105896672 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_nextind(x, i) end
	# elapsed time: 0.199674082 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_plusone(x, i) end
	# elapsed time: 0.238348901 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_ind(x, j) end
	# elapsed time: 0.098285909 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_nextind(x, j) end
	# elapsed time: 0.621622011 seconds (240000000 bytes allocated, 38.06% gc time)

	# julia> @time for l in 1:100 f_plusone(x, j) end
	# elapsed time: 0.655069184 seconds (240000000 bytes allocated, 34.15% gc time)

	# julia> @time for l in 1:100 f_ind(x, k) end
	# elapsed time: 0.109709424 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_nextind(x, k) end
	# elapsed time: 0.28053811 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_plusone(x, k) end
	# elapsed time: 0.279604258 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 g(x, k) end
	# elapsed time: 0.317775374 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 f_ind(x, 4) end
	# elapsed time: 0.089790495 seconds (0 bytes allocated)

	# julia> @time for l in 1:100 g(x, 4) end
	# elapsed time: 0.24096117 seconds (0 bytes allocated)