Last active
January 11, 2019 05:30
-
-
Save alnsn/68f599bc9358fcee122d6175392d779f to your computer and use it in GitHub Desktop.
Lua module for BDZ perfect hash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Building block for generating a perfect not order preserving | |
-- non-minimal hash. | |
-- | |
-- For a keyset of size N, it builds a random bipartite graph of | |
-- size N and order 2 * math.floor(1.0625 * N). | |
-- With a good probability the graph is acyclic and it's possible | |
-- to select one vertex from each edge/key that is guaranteed to | |
-- be unique. Vertices are selected by comparing two bits in | |
-- a bitset that has the same size as the order of the graph. | |
-- | |
-- The module comes with a selftest and a small demo function. | |
-- | |
-- References: | |
-- | |
-- http://cmph.sourceforge.net/bdz.html | |
-- | |
-- Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui, | |
-- Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano | |
-- Vigna. | |
-- http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf | |
-- | |
-- See also https://github.com/alnsn/rgph | |
local bit = bit32 or bit or require "bit" | |
-- Array of oriented edges { { edge=e, vert=v, degree=d }, ... } | |
-- is stored in a flat array { e, v, d, ... } for better performance. | |
local xored_edge = 1 | |
local xored_vert = 2 | |
local oedge_degree = 3 | |
local function add_remove_oedge(oedges, delta, e, v1, v2) | |
oedges[3*v1+xored_edge] = bit.bxor(oedges[3*v1+xored_edge], e) | |
oedges[3*v1+xored_vert] = bit.bxor(oedges[3*v1+xored_vert], v2) | |
oedges[3*v1+oedge_degree] = oedges[3*v1+oedge_degree] + delta | |
end | |
local function add_edge(oedges, e, v1, v2) | |
add_remove_oedge(oedges, 1, e, v1, v2) | |
add_remove_oedge(oedges, 1, e, v2, v1) | |
end | |
local function remove_vertex(oedges, v1, order, top) | |
if oedges[3*v1+oedge_degree] == 1 then | |
oedges[3*v1+oedge_degree] = 0 | |
local e = oedges[3*v1+xored_edge] | |
local v2 = oedges[3*v1+xored_vert] | |
add_remove_oedge(oedges, -1, e, v2, v1) | |
order[top] = e | |
top = top - 1 | |
end | |
return top | |
end | |
local function peel(edges, nkeys, oedges, nverts, order) | |
local top = nkeys | |
for i = 1, nverts do | |
local v1 = i - 1 | |
top = remove_vertex(oedges, v1, order, top) | |
end | |
local i = nkeys | |
while i > top do | |
local o = order[i] | |
top = remove_vertex(oedges, edges[2*o+1], order, top) | |
top = remove_vertex(oedges, edges[2*o+2], order, top) | |
i = i - 1 | |
end | |
return top | |
end | |
local graph_mt = {} | |
graph_mt.__index = graph_mt | |
local function new_graph(nkeys) | |
local nverts = nkeys < 12 and 24 or 2 * math.floor(1.0625 * nkeys) | |
if nkeys <= 0 or nverts >= 2^32 then return nil, "out of range" end | |
local g = { nkeys = nkeys, nverts = nverts, | |
edges = { 0 }, oedges = { 0 }, order = { -1 } } | |
return setmetatable(g, graph_mt) | |
end | |
function graph_mt:entries() | |
return self.nkeys | |
end | |
function graph_mt:vertices() | |
return self.nverts | |
end | |
function graph_mt:edge(e) | |
local edges = self.edges | |
return edges[2*e+1], edges[2*e+2] | |
end | |
function graph_mt:build(seed, hash, reduce, iter, state, key) | |
local oedges = self.oedges | |
local nverts = self.nverts | |
for i = 1, 3*nverts do | |
oedges[i] = 0 | |
end | |
local edges = self.edges | |
local nkeys = self.nkeys | |
for i = 1, 2*nkeys do | |
edges[i] = 0 | |
end | |
local partsz = math.floor(nverts / 2) | |
assert(nverts == 2 * partsz) | |
for i = 1, nkeys do | |
key = iter(state, key) | |
if key == nil then return nil, "no key" end | |
local e = i - 1 | |
local h1, h2 = hash(key, seed) | |
local v1, v2 = reduce(partsz, h1, h2) | |
edges[2*e+1] = v1 | |
edges[2*e+2] = v2 | |
add_edge(oedges, e, v1, v2) | |
end | |
local order = self.order | |
if #order ~= nkeys then | |
for i = 1, nkeys do | |
order[i] = -1 | |
end | |
end | |
local core = peel(edges, nkeys, oedges, nverts, order) | |
self.core = core | |
return core == 0 or nil, core ~= 0 and "cycle" or nil | |
end | |
function graph_mt:assign(assignments) | |
if not self.core or self.core ~= 0 then | |
return nil, "not built" | |
end | |
local unassigned = 2 | |
assignments = assignments or {} | |
local nverts = self.nverts | |
for i = 1, nverts do | |
assignments[i] = unassigned | |
end | |
local nkeys = self.nkeys | |
local edges = self.edges | |
local order = self.order | |
for i = 1, nkeys do | |
local e = order[i] | |
local v1 = edges[2*e+1] + 1 | |
local v2 = edges[2*e+2] + 1 | |
if assignments[v2] == unassigned then | |
assignments[v2] = (1 - assignments[v1]) % unassigned | |
end | |
if assignments[v1] == unassigned then | |
assignments[v1] = (0 - assignments[v2]) % unassigned | |
end | |
end | |
return assignments | |
end | |
-- selftest | |
if not ... then | |
-- Equivalent to mi_vector_hash(htole32(value), 4, seed, h) on NetBSD. | |
local function jenkins2v_u32(value, seed) | |
local mod = 2^32 | |
local h1 = (0x9e3779b9 + value) % mod | |
local h2 = 0x9e3779b9 | |
local h3 = (seed + 4) % mod | |
-- mix | |
h1 = (mod + mod + h1 - h2 - h3) % mod | |
h1 = (mod + bit.bxor(h1, bit.rshift(h3, 13))) % mod | |
h2 = (mod + mod + h2 - h3 - h1) % mod | |
h2 = (mod + bit.bxor(h2, bit.lshift(h1, 8))) % mod | |
h3 = (mod + mod + h3 - h1 - h2) % mod | |
h3 = (mod + bit.bxor(h3, bit.rshift(h2, 13))) % mod | |
h1 = (mod + mod + h1 - h2 - h3) % mod | |
h1 = (mod + bit.bxor(h1, bit.rshift(h3, 12))) % mod | |
h2 = (mod + mod + h2 - h3 - h1) % mod | |
h2 = (mod + bit.bxor(h2, bit.lshift(h1, 16))) % mod | |
h3 = (mod + mod + h3 - h1 - h2) % mod | |
h3 = (mod + bit.bxor(h3, bit.rshift(h2, 5))) % mod | |
h1 = (mod + mod + h1 - h2 - h3) % mod | |
h1 = (mod + bit.bxor(h1, bit.rshift(h3, 3))) % mod | |
h2 = (mod + mod + h2 - h3 - h1) % mod | |
h2 = (mod + bit.bxor(h2, bit.lshift(h1, 10))) % mod | |
h3 = (mod + mod + h3 - h1 - h2) % mod | |
h3 = (mod + bit.bxor(h3, bit.rshift(h2, 15))) % mod | |
return h1, h2, h3 | |
end | |
local function mod_reduce(sz, h1, h2) | |
return h1 % sz, (h2 % sz) + sz | |
end | |
local keys = { | |
-- Values don't matter. | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
} | |
-- This particular seed value generates an acyclic graph. | |
-- In general, random seeds should be passed repeatedly | |
-- until the build succeeds. | |
local seed = 2 | |
local g = new_graph(#keys) | |
assert(g:build(seed, jenkins2v_u32, mod_reduce, ipairs(keys))) | |
local assignments = assert(g:assign()) | |
local unassigned = 2 | |
local duplicates = {} | |
for e = 0, g:entries() - 1 do | |
local v1, v2 = g:edge(e) | |
assert(assignments[v1+1] < unassigned, "must be assigned") | |
assert(assignments[v2+1] < unassigned, "must be assigned") | |
-- Select v1 or v2. | |
local eq = assignments[v1+1] == assignments[v2+1] | |
local h = eq and v1 or v2 | |
assert(not duplicates[h], "hashes must be unique") | |
duplicates[h] = true | |
end | |
local partition_size = g:vertices() / 2 | |
-- demo | |
local function perfect_hash(value) | |
local h1, h2 = jenkins2v_u32(value, seed) | |
local v1, v2 = mod_reduce(partition_size, h1, h2) | |
local eq = assignments[v1+1] == assignments[v2+1] | |
return eq and v1 or v2 | |
end | |
for i, _ in ipairs(keys) do | |
print(perfect_hash(i)) | |
end | |
end | |
return { new_graph = new_graph } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment