simonster/divinverse.jl

## divinverse.jl
import Base.rem, Base.div, Base.divrem

unsigned_type(::Int8) = Uint8
unsigned_type(::Int16) = Uint16
unsigned_type(::Int32) = Uint32
unsigned_type(::Int64) = Uint64
unsigned_type(::Int128) = Uint128
immutable SignedMultiplicativeInverse{T<:Signed}
    divisor::T
    multiplier::T
    addmul::Int8
    shift::Uint8

    function SignedMultiplicativeInverse(d::T)
        ut = unsigned_type(d)
        signedmin::ut = typemin(d)

        ad::ut = abs(d)
        ad <= 1 && error("cannot compute magic for d == $d")
        t::ut = signedmin + signbit(d)
        anc::ut = t - 1 - rem(t, ad)   # absolute value of nc
        p = sizeof(d)*8 - 1            # initialize p
        q1::ut, r1::ut = divrem(signedmin, anc)
        q2::ut, r2::ut = divrem(signedmin, ad)
        while true
            p += 1
            q1 *= 2                    # update q1 = 2p/abs(nc)
            r1 *= 2                    # update r1 = rem(2p/abs(nc))
            if r1 >= anc               # must be unsigned comparison
                q1 += 1
                r1 -= anc
            end
            q2 *= 2                    # update q2 = 2p/abs(d)
            r2 *= 2                    # update r2 = rem(2p/abs(d))
            if r2 >= ad                # must be unsigned comparison
                q2 += 1
                r2 -= ad
            end
            delta::ut = ad - r2
            (q1 < delta || (q1 == delta && r1 == 0)) || break
        end

        m = flipsign(oftype(d, q2 + 1), d)    # resulting magic number
        s = p - sizeof(d)*8                   # resulting shift
        new(d, m, d > 0 && m < 0 ? int8(1) : d < 0 && m > 0 ? int8(-1) : int8(0), uint8(s))
    end
end
SignedMultiplicativeInverse(x::Signed) = SignedMultiplicativeInverse{typeof(x)}(x)

immutable UnsignedMultiplicativeInverse{T<:Unsigned}
    divisor::T
    multiplier::T
    add::Bool
    shift::Uint8

    function UnsignedMultiplicativeInverse(d::T)
        d == 0 && error("cannot compute magic for d == 0")
        u2 = convert(T, 2)
        add = false
        signedmin::typeof(d) = one(d) << (sizeof(d)*8-1)
        signedmax::typeof(d) = signedmin - 1
        allones::typeof(d) = zero(d) - 1

        nc::typeof(d) = allones - rem(convert(T, allones - d), d)
        p = 8*sizeof(d) - 1                    # initialize p
        q1::typeof(d), r1::typeof(d) = divrem(signedmin, nc)
        q2::typeof(d), r2::typeof(d) = divrem(signedmax, d)
        while true
            p += 1
            if r1 >= convert(T, nc - r1)
                q1 = q1 + q1 + 1        # update q1
                r1 = r1 + r1 - nc       # update r1
            else
                q1 = q1 + q1            # update q1
                r1 = r1 + r1            # update r1
            end
            if convert(T, r2 + 1) >= convert(T, d - r2)
                add |= q2 >= signedmax
                q2 = q2 + q2 + 1        # update q2
                r2 = r2 + r2 + 1 - d    # update r2
            else
                add |= q2 >= signedmin
                q2 = q2 + q2            # update q2
                r2 = r2 + r2 + 1        # update r2
            end
            delta::typeof(d) = d - 1 - r2
            (p < sizeof(d)*16 && (q1 < delta || (q1 == delta && r1 == 0))) || break
        end
        m = q2 + 1                   # resulting magic number
        s = p - sizeof(d)*8 - add    # resulting shift
        new(d, convert(typeof(d), m), add, uint8(s))
    end
end
UnsignedMultiplicativeInverse(x::Unsigned) = UnsignedMultiplicativeInverse{typeof(x)}(x)

function div{T}(a::T, b::SignedMultiplicativeInverse{T})
    x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8)
    x = convert(T, x + convert(T, a*b.addmul))
    convert(T, signbit(x) + (x >> b.shift))
end
function div{T}(a::T, b::UnsignedMultiplicativeInverse{T})
    x = convert(T, (widen(a)*b.multiplier) >>> sizeof(a)*8)
    x = ifelse(b.add, convert(T, convert(T, (convert(T, a - x) >>> 1)) + x), x)
    x >>> b.shift
end

rem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T})) =
    a - div(a, b)*b.divisor

function divrem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T}))
    d = div(a, b)
    (d, a - d*b.divisor)
end

multiplicativeinverse(x::Signed) = SignedMultiplicativeInverse(x)
multiplicativeinverse(x::Unsigned) = UnsignedMultiplicativeInverse(x)

A = rand(Int64, 10^8)
B = rand(Uint64, 10^8)
n = rand(Int64)
m = rand(Uint64)
x1 = div(A, n)
x2 = div(B, m)

println("using signed intrinsics")
gc()
gc_disable()
@time for i = 1:2; div(A, n); end
gc_enable()
println("using unsigned intrinsics")
gc()
gc_disable()
@time for i = 1:2; div(B, m); end
gc_enable()

import Core.Intrinsics.llvmcall
function unsafe_div(num::Array{Int64}, den::Int64)
    out = similar(num)
    isempty(num) && return out
    den == 0 && throw(DivideError())

    @inbounds for i = 1:length(out)
        out[i] = llvmcall("""%3 = sdiv i64 %0, %1
                             ret i64 %3""",
                          Int64, (Int64, Int64), num[i], den)
    end
    out
end
function unsafe_div(num::Array{Uint64}, den::Uint64)
    out = similar(num)
    isempty(num) && return out
    den == 0 && throw(DivideError())

    @inbounds for i = 1:length(out)
        out[i] = llvmcall("""%3 = udiv i64 %0, %1
                             ret i64 %3""",
                          Uint64, (Uint64, Uint64), num[i], den)
    end
    out
end
unsafe_div(A, n)
unsafe_div(B, m)
println("signed without check")
gc()
gc_disable()
@time for i = 1:2; unsafe_div(A, n); end
gc_enable()
println("unsigned without check")
gc()
gc_disable()
@time for i = 1:2; unsafe_div(B, m); end
gc_enable()

function div{T<:Signed}(num::StridedArray{T}, den::T)
    out = similar(num)
    isempty(num) && return out

    if den == 0
        throw(DivideError())
    elseif den == 1
        copy!(out, num)
    elseif den == -1
        for i = 1:length(out)
            @inbounds out[i] = -num[i]
        end
    else
        m = multiplicativeinverse(den)
        @inbounds if m.addmul == -1
            for i = 1:length(out)
                x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) - num[i])
                out[i] = signbit(x) + (x >> m.shift)
            end
        elseif m.addmul == 1
            for i = 1:length(out)
                x = convert(T, convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8) + num[i])
                out[i] = signbit(x) + (x >> m.shift)
            end
        else
            for i = 1:length(out)
                x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
                out[i] = signbit(x) + (x >> m.shift)
            end
        end
    end
    out
end

function div{T<:Unsigned}(num::StridedArray{T}, den::T)
    out = similar(num)
    isempty(num) && return out

    if den == 0
        throw(DivideError())
    else
        m = multiplicativeinverse(den)
        @inbounds if m.add
            for i = 1:length(out)
                x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
                x = convert(T, convert(T, (convert(T, num[i] - x) >>> 1)) + x)
                out[i] = (x >>> m.shift)
            end
        else
            for i = 1:length(out)
                x = convert(T, (widen(num[i])*m.multiplier) >>> sizeof(T)*8)
                out[i] = (x >>> m.shift)
            end
        end
    end
    out
end

y1 = div(A, n)
y2 = div(B, m)
println("using signed inverse")
gc()
gc_disable()
@time for i = 1:2; div(A, n); end
gc_enable()
println("using unsigned inverse")
gc()
gc_disable()
@time for i = 1:2; div(B, m); end
gc_enable()

@assert x1 == y1
@assert x2 == y2

## output
using signed intrinsics
elapsed time: 1.690360625 seconds (1600000096 bytes allocated)
using unsigned intrinsics
elapsed time: 1.558626973 seconds (1600000096 bytes allocated)
signed without check
elapsed time: 1.493279208 seconds (1600000096 bytes allocated)
unsigned without check
elapsed time: 1.263156649 seconds (1600000096 bytes allocated)
using signed inverse
elapsed time: 0.525588555 seconds (1600000096 bytes allocated)
using unsigned inverse
elapsed time: 0.485345448 seconds (1600000096 bytes allocated)
	import Base.rem, Base.div, Base.divrem

	unsigned_type(::Int8) = Uint8
	unsigned_type(::Int16) = Uint16
	unsigned_type(::Int32) = Uint32
	unsigned_type(::Int64) = Uint64
	unsigned_type(::Int128) = Uint128
	immutable SignedMultiplicativeInverse{T<:Signed}
	divisor::T
	multiplier::T
	addmul::Int8
	shift::Uint8

	function SignedMultiplicativeInverse(d::T)
	ut = unsigned_type(d)
	signedmin::ut = typemin(d)

	ad::ut = abs(d)
	ad <= 1 && error("cannot compute magic for d == $d")
	t::ut = signedmin + signbit(d)
	anc::ut = t - 1 - rem(t, ad) # absolute value of nc
	p = sizeof(d)*8 - 1 # initialize p
	q1::ut, r1::ut = divrem(signedmin, anc)
	q2::ut, r2::ut = divrem(signedmin, ad)
	while true
	p += 1
	q1 *= 2 # update q1 = 2p/abs(nc)
	r1 *= 2 # update r1 = rem(2p/abs(nc))
	if r1 >= anc # must be unsigned comparison
	q1 += 1
	r1 -= anc
	end
	q2 *= 2 # update q2 = 2p/abs(d)
	r2 *= 2 # update r2 = rem(2p/abs(d))
	if r2 >= ad # must be unsigned comparison
	q2 += 1
	r2 -= ad
	end
	delta::ut = ad - r2
	(q1 < delta \|\| (q1 == delta && r1 == 0)) \|\| break
	end

	m = flipsign(oftype(d, q2 + 1), d) # resulting magic number
	s = p - sizeof(d)*8 # resulting shift
	new(d, m, d > 0 && m < 0 ? int8(1) : d < 0 && m > 0 ? int8(-1) : int8(0), uint8(s))
	end
	end
	SignedMultiplicativeInverse(x::Signed) = SignedMultiplicativeInverse{typeof(x)}(x)

	immutable UnsignedMultiplicativeInverse{T<:Unsigned}
	divisor::T
	multiplier::T
	add::Bool
	shift::Uint8

	function UnsignedMultiplicativeInverse(d::T)
	d == 0 && error("cannot compute magic for d == 0")
	u2 = convert(T, 2)
	add = false
	signedmin::typeof(d) = one(d) << (sizeof(d)*8-1)
	signedmax::typeof(d) = signedmin - 1
	allones::typeof(d) = zero(d) - 1

	nc::typeof(d) = allones - rem(convert(T, allones - d), d)
	p = 8*sizeof(d) - 1 # initialize p
	q1::typeof(d), r1::typeof(d) = divrem(signedmin, nc)
	q2::typeof(d), r2::typeof(d) = divrem(signedmax, d)
	while true
	p += 1
	if r1 >= convert(T, nc - r1)
	q1 = q1 + q1 + 1 # update q1
	r1 = r1 + r1 - nc # update r1
	else
	q1 = q1 + q1 # update q1
	r1 = r1 + r1 # update r1
	end
	if convert(T, r2 + 1) >= convert(T, d - r2)
	add \|= q2 >= signedmax
	q2 = q2 + q2 + 1 # update q2
	r2 = r2 + r2 + 1 - d # update r2
	else
	add \|= q2 >= signedmin
	q2 = q2 + q2 # update q2
	r2 = r2 + r2 + 1 # update r2
	end
	delta::typeof(d) = d - 1 - r2
	(p < sizeof(d)*16 && (q1 < delta \|\| (q1 == delta && r1 == 0))) \|\| break
	end
	m = q2 + 1 # resulting magic number
	s = p - sizeof(d)*8 - add # resulting shift
	new(d, convert(typeof(d), m), add, uint8(s))
	end
	end
	UnsignedMultiplicativeInverse(x::Unsigned) = UnsignedMultiplicativeInverse{typeof(x)}(x)

	function div{T}(a::T, b::SignedMultiplicativeInverse{T})
	x = convert(T, (widen(a)b.multiplier) >>> sizeof(a)8)
	x = convert(T, x + convert(T, a*b.addmul))
	convert(T, signbit(x) + (x >> b.shift))
	end
	function div{T}(a::T, b::UnsignedMultiplicativeInverse{T})
	x = convert(T, (widen(a)b.multiplier) >>> sizeof(a)8)
	x = ifelse(b.add, convert(T, convert(T, (convert(T, a - x) >>> 1)) + x), x)
	x >>> b.shift
	end

	rem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T})) =
	a - div(a, b)*b.divisor

	function divrem{T}(a::T, b::Union(SignedMultiplicativeInverse{T}, UnsignedMultiplicativeInverse{T}))
	d = div(a, b)
	(d, a - d*b.divisor)
	end

	multiplicativeinverse(x::Signed) = SignedMultiplicativeInverse(x)
	multiplicativeinverse(x::Unsigned) = UnsignedMultiplicativeInverse(x)

	A = rand(Int64, 10^8)
	B = rand(Uint64, 10^8)
	n = rand(Int64)
	m = rand(Uint64)
	x1 = div(A, n)
	x2 = div(B, m)

	println("using signed intrinsics")
	gc()
	gc_disable()
	@time for i = 1:2; div(A, n); end
	gc_enable()
	println("using unsigned intrinsics")
	gc()
	gc_disable()
	@time for i = 1:2; div(B, m); end
	gc_enable()

	import Core.Intrinsics.llvmcall
	function unsafe_div(num::Array{Int64}, den::Int64)
	out = similar(num)
	isempty(num) && return out
	den == 0 && throw(DivideError())

	@inbounds for i = 1:length(out)
	out[i] = llvmcall("""%3 = sdiv i64 %0, %1
	ret i64 %3""",
	Int64, (Int64, Int64), num[i], den)
	end
	out
	end
	function unsafe_div(num::Array{Uint64}, den::Uint64)
	out = similar(num)
	isempty(num) && return out
	den == 0 && throw(DivideError())

	@inbounds for i = 1:length(out)
	out[i] = llvmcall("""%3 = udiv i64 %0, %1
	ret i64 %3""",
	Uint64, (Uint64, Uint64), num[i], den)
	end
	out
	end
	unsafe_div(A, n)
	unsafe_div(B, m)
	println("signed without check")
	gc()
	gc_disable()
	@time for i = 1:2; unsafe_div(A, n); end
	gc_enable()
	println("unsigned without check")
	gc()
	gc_disable()
	@time for i = 1:2; unsafe_div(B, m); end
	gc_enable()

	function div{T<:Signed}(num::StridedArray{T}, den::T)
	out = similar(num)
	isempty(num) && return out

	if den == 0
	throw(DivideError())
	elseif den == 1
	copy!(out, num)
	elseif den == -1
	for i = 1:length(out)
	@inbounds out[i] = -num[i]
	end
	else
	m = multiplicativeinverse(den)
	@inbounds if m.addmul == -1
	for i = 1:length(out)
	x = convert(T, convert(T, (widen(num[i])m.multiplier) >>> sizeof(T)8) - num[i])
	out[i] = signbit(x) + (x >> m.shift)
	end
	elseif m.addmul == 1
	for i = 1:length(out)
	x = convert(T, convert(T, (widen(num[i])m.multiplier) >>> sizeof(T)8) + num[i])
	out[i] = signbit(x) + (x >> m.shift)
	end
	else
	for i = 1:length(out)
	x = convert(T, (widen(num[i])m.multiplier) >>> sizeof(T)8)
	out[i] = signbit(x) + (x >> m.shift)
	end
	end
	end
	out
	end

	function div{T<:Unsigned}(num::StridedArray{T}, den::T)
	out = similar(num)
	isempty(num) && return out

	if den == 0
	throw(DivideError())
	else
	m = multiplicativeinverse(den)
	@inbounds if m.add
	for i = 1:length(out)
	x = convert(T, (widen(num[i])m.multiplier) >>> sizeof(T)8)
	x = convert(T, convert(T, (convert(T, num[i] - x) >>> 1)) + x)
	out[i] = (x >>> m.shift)
	end
	else
	for i = 1:length(out)
	x = convert(T, (widen(num[i])m.multiplier) >>> sizeof(T)8)
	out[i] = (x >>> m.shift)
	end
	end
	end
	out
	end

	y1 = div(A, n)
	y2 = div(B, m)
	println("using signed inverse")
	gc()
	gc_disable()
	@time for i = 1:2; div(A, n); end
	gc_enable()
	println("using unsigned inverse")
	gc()
	gc_disable()
	@time for i = 1:2; div(B, m); end
	gc_enable()

	@assert x1 == y1
	@assert x2 == y2
	using signed intrinsics
	elapsed time: 1.690360625 seconds (1600000096 bytes allocated)
	using unsigned intrinsics
	elapsed time: 1.558626973 seconds (1600000096 bytes allocated)
	signed without check
	elapsed time: 1.493279208 seconds (1600000096 bytes allocated)
	unsigned without check
	elapsed time: 1.263156649 seconds (1600000096 bytes allocated)
	using signed inverse
	elapsed time: 0.525588555 seconds (1600000096 bytes allocated)
	using unsigned inverse
	elapsed time: 0.485345448 seconds (1600000096 bytes allocated)