Skip to content

Instantly share code, notes, and snippets.

@kmsquire
Last active July 11, 2020 16:09
Show Gist options
  • Save kmsquire/5147894 to your computer and use it in GitHub Desktop.
Save kmsquire/5147894 to your computer and use it in GitHub Desktop.
OrderedDict for Julia
n = 100000
strs = [randstring(10) for i = 1:n];
nums = rand(Int, n);
# regular map
function dict_insertion_test{T<:Associative}(d::T)
#empty!(d)
for i = 1:n
d[strs[i]] = nums[i]
end
d
end
function dict_deletion_test{T<:Associative}(d::T, iters::Int)
#dict_insertion_test(d)
for i in rand(1:n, iters)
delete!(d, strs[i], 0)
end
d
end
function dict_ins_del_test{T<:Associative}(d::T, iters::Int)
#dict_insertion_test(d)
for i in rand(1:n, iters)
randbool()? delete!(d, strs[i], 0) : (d[strs[i]] = nums[i])
end
d
end
function test_ins(T::Type, iter::Int)
d = T{String,Int}()
t = 0.0
for i = 1:iter
empty!(d)
gc()
t += @elapsed dict_insertion_test(d)
end
t
end
function test_ins_del(T::Type, iter::Int)
d = T{String,Int}()
t = 0.0
for i = 1:iter
empty!(d)
dict_insertion_test(d)
gc()
t += @elapsed dict_ins_del_test(d, 100000)
end
t
end
function test_del(T::Type, iter::Int)
d = T{String,Int}()
t = 0.0
for i = 1:iter
empty!(d)
dict_insertion_test(d)
gc()
t += @elapsed dict_deletion_test(d, 100000)
end
t
end
function run_all()
for test in [test_ins, test_del, test_ins_del]
println(test)
println("="^length(string(test)))
for T in [Dict, OrderedDict]
print("$T: ")
times = Float64[test(T, 5) for i = 1:5]
println("$times, median=$(median(times))")
end
println()
end
end
Here are 3 implemenations of an `OrderedDict` for julia. The first
creates a `DictItem` class, which is stored as the value in a wrapped
Dict. The second only stores a vector of keys, and the third
reimplements the standard julia hash-based dictionary inline, and adds
just enough information to maintain key-value order.
I had other, similar versions along the way, with minor tweaks that
didn't change the timings much. This includes a version where
DictItem was immutable.
One version I did not implement was a doubly-linked-list version,
which should be much faster at deletion than the first two, but which
is more limited in the operations that can be done with it (e.g., not
shown here is code which allows the dictionary to be sorted or treated
as a dequeue of (key, value) pairs). See
https://github.com/kmsquire/OrderedCollections.jl for a version which
includes this code.
`dict_tests.jl` is timing code that tests the insertion of 100,000
items, deletion of 100,000 items (with possible duplicates), and a
random mix of insertion and deletions. The weak point of the first
two versions is clearly deletion.
============================================================================================
OrderedDictv1.jl
============================================================================================
julia> require("dict_tests.jl")
julia> require("OrderedDictv1.jl")
julia> run_all()
test_ins
========
Dict{K,V}: [0.339021, 0.293294, 0.292359, 0.293016, 0.291899], median=0.293015884
OrderedDict{K,V}: [0.558897, 0.432915, 0.43435, 0.43405, 0.435232], median=0.43434986900000006
test_del
========
Dict{K,V}: [0.584101, 0.565818, 0.565554, 0.564916, 0.565354], median=0.5655537869999999
OrderedDict{K,V}: [0.797462, 0.785816, 0.781939, 0.783949, 0.784663], median=0.784663365
test_ins_del
============
Dict{K,V}: [0.61392, 0.612826, 0.611372, 0.612516, 0.613474], median=0.612826085
OrderedDict{K,V}: [0.829546, 0.827917, 0.828826, 0.828606, 0.831101], median=0.8288263260000001
============================================================================================
OrderedDictv3.jl
============================================================================================
julia> require("dict_tests.jl")
julia> require("OrderedDictv3.jl")
julia> run_all()
test_ins
========
Dict{K,V}: [0.323519, 0.284433, 0.282385, 0.281725, 0.281555], median=0.282384714
OrderedDict{K,V}: [0.489741, 0.380406, 0.37986, 0.380098, 0.37905], median=0.38009831499999996
test_del
========
Dict{K,V}: [0.594232, 0.573725, 0.574156, 0.57504, 0.574906], median=0.5749057240000001
OrderedDict{K,V}: [0.624253, 0.605593, 0.610013, 0.606928, 0.609852], median=0.609851745
test_ins_del
============
Dict{K,V}: [0.614838, 0.615072, 0.615307, 0.614056, 0.615215], median=0.61507241
OrderedDict{K,V}: [0.654655, 0.658411, 0.656661, 0.656376, 0.654963], median=0.6563759739999999
import Base.isempty, Base.length, Base.sizeof
import Base.start, Base.next, Base.done
import Base.has, Base.get
import Base.assign, Base.ref, Base.delete!, Base.empty!
import Base.show, Base.similar
import Base.findnextnot, Base.findfirstnot
type DictItem{K,V}
k::K
v::V
idx::Int
end
type OrderedDict{K,V} <: Associative{K,V}
ht::Dict{K,DictItem{K,V}}
v::Vector{DictItem{K,V}}
v_slots::BitArray
ndel::Int
OrderedDict() = new(Dict{K,DictItem{K,V}}(), similar(Array(DictItem{K,V},1), 0), BitArray(), 0)
function OrderedDict(ks,vs)
d = new(Dict{K,DictItem{K,V}}(), similar(Array(DictItem{K,V},1), 0), BitArray(), 0)
for (i, (k, v)) in enumerate(zip(ks, vs))
item = DictItem{K,V}(k,v,i)
d.ht[k] = item
push!(d.v, item)
push!(d.v_slots, true)
end
d
end
end
OrderedDict() = OrderedDict{Any, Any}()
OrderedDict(ks, vs) = OrderedDict{Any,Any}(ks, vs)
OrderedDict{K,V}(ks::AbstractArray{K}, vs::AbstractArray{V}) = OrderedDict{K,V}(ks, vs)
# syntax entry points
OrderedDict{K,V}(ks::(K...), vs::(V...)) = OrderedDict{K ,V }(ks, vs)
OrderedDict{K }(ks::(K...), vs::Tuple ) = OrderedDict{K ,Any}(ks, vs)
OrderedDict{V }(ks::Tuple , vs::(V...)) = OrderedDict{Any,V }(ks, vs)
similar{K,V}(d::OrderedDict{K,V}) = OrderedDict{K,V}()
# TODO: serialize, deserialize
## collections ##
isempty(d::OrderedDict) = isempty(d.ht)
length(d::OrderedDict) = length(d.ht) # d.v may have empty slots
has(d::OrderedDict, key) = has(d.ht, key)
## associative ##
get(d::OrderedDict, key, default) = has(d, key) ? d[key] : default
function empty!(d::OrderedDict)
empty!(d.ht)
empty!(d.v)
empty!(d.v_slots)
d.ndel = 0
end
## indexable ##
ref(d::OrderedDict, key) = d.ht[key].v
function assign{K,V}(d::OrderedDict{K,V}, v, key)
# 3/4 deleted?
if d.ndel >= ((3*length(d))>>2)
_compact(d)
end
if has(d, key)
d.ht[key].v = v
else
item = DictItem{K,V}(key, v, length(d.v)+1)
d.ht[key] = item
push!(d.v, item)
push!(d.v_slots, true)
end
d
end
## iterable ##
skip_deleted(d::OrderedDict, i) = findnext(d.v_slots, i)
start(d::OrderedDict) = skip_deleted(d,1)
done(d::OrderedDict, i) = (i == 0)
next(d::OrderedDict, i) = ((item=d.v[i]; (item.k, item.v)), skip_deleted(d,i+1))
## utility ##
function _compact(d::OrderedDict)
v = d.v
slots = d.v_slots
# start with first empty slot
s_pos = findfirstnot(slots)
if s_pos == 0
return
end
v_pos = findnext(slots, s_pos)
# fill down empty slots with consecutive filled slots
while v_pos != 0
item = v[s_pos] = v[v_pos]
item.idx = s_pos
s_pos += 1
v_pos = findnext(slots, v_pos+1)
end
new_sz = length(d)
resize!(d.v, new_sz)
resize!(d.v_slots, new_sz)
slots[1:end] = true
d.ndel = 0
nothing
end
## associative ##
function delete!(d::OrderedDict, key)
item = d.ht[key]
delete!(d.ht, key)
# Is this right?
ccall(:jl_arrayunset, Void, (Any, Uint), d.v, item.idx-1)
d.v_slots[item.idx] = false
d.ndel += 1
item.v
end
function delete!(d::OrderedDict, key, default)
if has(d, key)
delete!(d, key)
else
default
end
end
# Construction
import Base.similar, Base.sizehint
# Serialization
import Base.serialize, Base.deserialize
# Iteration
import Base.start, Base.next, Base.done
# General Collections
import Base.isempty, Base.empty!, Base.length
# Indexable Collections
import Base.setindex!, Base.getindex
# Associative Collections
import Base.has, Base.get, Base.getkey, Base.delete!
# Dequeue-like
import Base.push!, Base.pop!, Base.unshift!, Base.shift!, Base.append!, Base.insert!
# Useful, unexported by Base
import Base.findnextnot, Base.findfirstnot ## from bitarray.jl
import Base._tablesz, Base.hashindex ## from dict.jl
# Sorting
import Base.sort!, Base.sort, Base.sortby!, Base.sortby, Base.sortperm
import Sort.DEFAULT_UNSTABLE, Sort.DEFAULT_STABLE,
Sort.Ordering, Sort.Algorithm,
Sort.Forward, Sort.Reverse,
Sort.By, Sort.Lt, Sort.lt
# Exports
export OrderedDict,
similar,
sizehint,
start,
next,
done,
empty!,
getindex,
setindex!,
push!,
pop!,
unshift!,
shift!,
append!,
insert!,
sort,
sort!,
sortby,
sortby!,
sortperm,
indexof,
getitem
######################
## OrderedDict type ##
type OrderedDict{K,V} <: Associative{K,V}
slots::Array{Uint8,1}
keys::Array{K,1}
vals::Array{V,1}
ord_idxs::Array{Int,1}
ord_slots::BitArray
ord::Array{Int,1}
ndel::Int
odel::Int
count::Int
deleter::Function
function OrderedDict()
n = 16
new(zeros(Uint8,n), Array(K,n), Array(V,n), Array(Int,n), BitArray(), Array(Int,0), 0, 0, 0, identity)
end
function OrderedDict(ks, vs)
n = length(ks)
h = OrderedDict{K,V}()
for i=1:n
h[ks[i]] = vs[i]
end
return h
end
end
OrderedDict() = OrderedDict{Any,Any}()
OrderedDict{K,V}(ks::AbstractArray{K}, vs::AbstractArray{V}) = OrderedDict{K,V}(ks,vs)
OrderedDict(ks, vs) = OrderedDict{Any,Any}(ks, vs)
# syntax entry points
OrderedDict{K,V}(ks::(K...), vs::(V...)) = OrderedDict{K ,V }(ks, vs)
OrderedDict{K }(ks::(K...), vs::Tuple ) = OrderedDict{K ,Any}(ks, vs)
OrderedDict{V }(ks::Tuple , vs::(V...)) = OrderedDict{Any,V }(ks, vs)
##########################
## Construction-related ##
similar{K,V}(d::OrderedDict{K,V}) = OrderedDict{K,V}()
function sizehint(d::OrderedDict, newsz)
oldsz = length(d.slots)
if newsz <= oldsz
# todo: shrink
# be careful: rehash() assumes everything fits. it was only designed
# for growing.
return d
end
# grow at least 25%
newsz = max(newsz, (oldsz*5)>>2)
rehash(d, newsz)
end
## TODO: some of these are simply copied from base/dict.jl,
## and the type was changed from Dict -> OrderedDict
##
## (Field names might also be slightly different, but
## can be reverted.)
##
## It would be nice if they were defined in terms
## of an AbstractDict <: Associative
###################
## Serialization ##
# TODO: Remove these if AbstractDict versions become available
function serialize(s, t::OrderedDict)
serialize_type(s, typeof(t))
write(s, int32(length(t)))
for (k,v) in t
serialize(s, k)
serialize(s, v)
end
end
function deserialize{K,V}(s, T::Type{OrderedDict{K,V}})
n = read(s, Int32)
t = T(); sizehint(t, n)
for i = 1:n
k = deserialize(s)
v = deserialize(s)
t[k] = v
end
return t
end
########################################
## Dict/OrderedDict Utility functions ##
# TODO: Remove these if AbstractDict versions become available
isslotempty(h::OrderedDict, i::Int) = h.slots[i] == 0x0
isslotfilled(h::OrderedDict, i::Int) = h.slots[i] == 0x1
isslotmissing(h::OrderedDict, i::Int) = h.slots[i] == 0x2
# OrderedDict version of rehash
function rehash{K,V}(h::OrderedDict{K,V}, newsz)
newsz = _tablesz(newsz)
nel = h.count
h.ndel = h.count = 0
if nel == 0
resize!(h.slots, newsz)
fill!(h.slots, 0)
resize!(h.keys, newsz)
resize!(h.vals, newsz)
resize!(h.ord_idxs, newsz)
resize!(h.ord_slots, 0)
resize!(h.ord, 0)
return h
end
olds = h.slots
oldk = h.keys
oldv = h.vals
oldi = h.ord_idxs
sz = length(olds)
h.slots = zeros(Uint8,newsz)
h.keys = Array(K, newsz)
h.vals = Array(V, newsz)
h.ord_idxs = Array(Int, newsz)
for i = 1:sz
if olds[i] == 0x1
k = oldk[i]
index = hashindex(k, newsz)
while h.slots[index] != 0
index = (index & (newsz-1)) + 1
end
h.slots[index] = 0x1
h.keys[index] = k
h.vals[index] = oldv[i]
idx = oldi[i]
h.ord_idxs[index] = idx
h.ord[idx] = index
h.count += 1
end
end
return h
end
# get the index where a key is stored, or -1 if not present
# TODO: remove if AbstractDict version becomes available
function ht_keyindex{K,V}(h::OrderedDict{K,V}, key)
sz = length(h.keys)
iter = 0
maxprobe = max(16, sz>>6)
index = hashindex(key, sz)
orig = index
keys = h.keys
while true
if isslotempty(h,index)
break
end
if !isslotmissing(h,index) && isequal(key,keys[index])
return index
end
index = (index & (sz-1)) + 1
iter+=1
if iter > maxprobe || index==orig
break
end
end
return -1
end
# Removes empty slots of order array in OrderedDict
function _compact(h::OrderedDict)
# start with first empty slot
npos = findfirstnot(h.ord_slots)
if npos == 0 return; end
opos = findnext(h.ord_slots, npos)
# fill down empty slots with consecutive filled slots
while opos != 0
index = h.ord[npos] = h.ord[opos]
h.ord_idxs[index] = npos
npos += 1
opos = findnext(h.ord_slots, opos+1)
end
resize!(h.ord, h.count)
resize!(h.ord_slots, h.count)
h.ord_slots[1:end] = true
h.odel = 0
nothing
end
###############
## Iteration ##
skip_deleted(t::OrderedDict, i::Int) = findnext(t.ord_slots, i)
start(t::OrderedDict) = skip_deleted(t,1)
done(t::OrderedDict, i) = (i == 0)
next(t::OrderedDict, i) = (idx = t.ord[i]; ((t.keys[idx],t.vals[idx]), skip_deleted(t,i+1)))
#########################
## General Collections ##
isempty(t::OrderedDict) = (t.count == 0)
length(t::OrderedDict) = t.count
function empty!{K,V}(h::OrderedDict{K,V})
fill!(h.slots, 0x0)
sz = length(h.slots)
h.keys = Array(K, sz)
h.vals = Array(V, sz)
h.ord_idxs = Array(Int, sz)
h.ord_slots = BitArray()
h.ord = Array(Int, 0)
h.ndel = 0
h.odel = 0
h.count = 0
return h
end
###########################
## Indexable Collections ##
# As with ref, we want to allow assignment by index position as well, as key,
# but we ignore this when K<:Number
function setindex!{K,V}(h::OrderedDict{K,V}, kv::(Any,Any), index::Integer)
(key,v) = kv
ord_idx = indexof(h,key,0)
if ord_idx == index
return setindex!(h, v, key)
end
# TODO: this can be more efficient
delete!(h, h.keys[h.ord[index]])
insert!(h, index, kv)
end
setindex!{K<:Number,V}(h::OrderedDict{K,V}, v::(Any,Any), key::Integer) =
invoke(setindex!, (OrderedDict{K,V}, Any, Any), h, v, key)
setindex!{K<:Number,V}(h::OrderedDict{K,V}, v, key::Integer) =
invoke(setindex!, (OrderedDict{K,V}, Any, Any), h, v, key)
function setindex!{K,V}(h::OrderedDict{K,V}, v, key)
key = convert(K,key)
v = convert(V, v)
sz = length(h.keys)
if h.ndel >= ((3*sz)>>2) || h.count*3 > sz*2
# > 3/4 deleted or > 2/3 full
rehash(h, h.count > 64000 ? h.count*2 : h.count*4)
sz = length(h.keys) # rehash may resize the table at this point!
end
if h.odel >= ((3*length(h.ord))>>2)
# > 3/4 of ord array deleted
_compact(h)
end
iter = 0
maxprobe = max(16, sz>>6)
index = hashindex(key, sz)
orig = index
avail = -1 # an available slot
#keys = h.keys; vals = h.vals; ord_idxs = h.ord_idxs; ord_slots = h.ord_slots; ord = h.ord
while true
if isslotempty(h,index)
if avail > 0; index = avail; end
push!(h.ord, index)
push!(h.ord_slots, true)
h.slots[index] = 0x1
h.keys[index] = key
h.vals[index] = v
h.ord_idxs[index] = length(h.ord)
h.count += 1
return h
end
if isslotmissing(h,index)
if avail<0
# found an available slot, but need to keep scanning
# in case "key" already exists in a later collided slot.
avail = index
end
elseif isequal(key, h.keys[index])
h.vals[index] = v
return h
end
index = (index & (sz-1)) + 1
iter+=1
if iter > maxprobe || index==orig
break
end
end
if avail>0
index = avail
push!(h.ord, index)
push!(h.ord_slots, true)
h.slots[index] = 0x1
h.keys[index] = key
h.vals[index] = v
h.ord_idxs[index] = length(h.ord)
h.count += 1
return h
end
rehash(h, h.count > 64000 ? sz*2 : sz*4)
setindex!(h, v, key)
end
# We want to allow the user to access the (k,v) pairs by index
# However, first and foremost, this is a dictionary, so if the
# keys are numbers, assume any reference using an integer
# as a key is attempting a has lookup.
# TODO: This might be confusing behavior, so consider disabling
# and simply sequential access through getitem()
getindex{K,V}(h::OrderedDict{K,V}, ord_idx::Integer) = getitem(h, ord_idx)
function getindex{K<:Number,V}(h::OrderedDict{K,V}, key::Integer)
index = ht_keyindex(h, key)
return (index<0) ? throw(KeyError(key)) : h.vals[index]::V
end
function getindex{K,V}(h::OrderedDict{K,V}, key)
index = ht_keyindex(h, key)
return (index<0) ? throw(KeyError(key)) : h.vals[index]::V
end
function indexof{K,V}(h::OrderedDict{K,V}, key)
index = ht_keyindex(h, key)
return (index<0) ? throw(KeyError(key)) : (_compact(h); h.ord_idxs[index])
end
function indexof{K,V}(h::OrderedDict{K,V}, key, deflt)
index = ht_keyindex(h, key)
return (index<0) ? deflt : (_compact(h); h.ord_idxs[index])
end
#############################
## Associative Collections ##
has(h::OrderedDict, key) = (ht_keyindex(h, key) >= 0)
function get{K,V}(h::OrderedDict{K,V}, key, deflt)
index = ht_keyindex(h, key)
return (index<0) ? deflt : h.vals[index]::V
end
function getkey{K,V}(h::OrderedDict{K,V}, key, deflt)
index = ht_keyindex(h, key)
return (index<0) ? deflt : h.keys[index]::K
end
function getitem{K,V}(h::OrderedDict{K,V}, ord_idx)
_compact(h)
index = h.ord[ord_idx]
return (h.keys[index], h.vals[index])::(K,V)
end
function _delete!(h::OrderedDict, index)
val = h.vals[index]
idx = h.ord_idxs[index]
h.slots[index] = 0x2
ccall(:jl_arrayunset, Void, (Any, Uint), h.keys, index-1)
ccall(:jl_arrayunset, Void, (Any, Uint), h.vals, index-1)
# h.ord_idxs[index] = 0 ## not really necessary
# ord[idx] = 0 ## not really necessary
h.ord_slots[idx] = false
h.ndel += 1
h.odel += 1
h.count -= 1
return val
end
function delete!{K,V}(h::OrderedDict{K,V}, key)
index = ht_keyindex(h, key)
index > 0 ? _delete!(h, index) : throw(KeyError(key))
end
function delete!{K,V}(h::OrderedDict{K,V}, ord_idx::Integer)
key = h.keys[h.ord[ord_idx]]
(key, delete!(h, key))::(K,V)
end
delete!{K<:Number,V}(h::OrderedDict{K,V}, key::Integer) = invoke(delete!, (OrderedDict{K,V}, Any), h, key)
function delete!(h::OrderedDict, key, default)
index = ht_keyindex(h, key)
index > 0 ? _delete!(h, index) : default
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment