Last active
May 29, 2016 21:43
-
-
Save cbarrick/67adf9fdb4e884ae514de56c164294b2 to your computer and use it in GitHub Desktop.
An experiment to combine hashing with a floating point address space
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is an experiment to combine hashing with a floating point address space. | |
// | |
// # The Problem: | |
// | |
// In a binary search tree, we wish to assign each node a unique address such | |
// that nodes which sort to the right have higher addresses than nodes that sort | |
// to the left. We want to generate the address when the node is inserted, and | |
// we want addresses to be stable, meaning that new insertions will not effect | |
// existing addresses. We also want finite bounds on the address space. | |
// | |
// A secondary goal is that we want the low bits to exhibit entropy so that | |
// assigning addresses can also serve as a hash function. | |
// | |
// # The Naïve Solution: | |
// | |
// One obvious solution is to generate an address for each new node equal to the | |
// average of the addresses of the nodes to the immediate left and right at | |
// insertion time. | |
// | |
// We can assess this algorithm as an address generator by reasoning about the | |
// size of the address space. The address space is fundamentally limited by the | |
// number of bits in the address representation. In the IEEE 754 binary floating | |
// point formats, multiplying and dividing a number by two only perturbs the | |
// minimum possible number of bits. Thus this solution provides the largest | |
// address space. | |
// | |
// However, this algorithm serves as a poor hash function for the same reason | |
// that makes it a good address generator: very few bits change between | |
// addresses. | |
// | |
// See: https://en.wikipedia.org/wiki/Double-precision_floating-point_format | |
// | |
// # Our Solution | |
// | |
// We extend the naïve solution to a weighted average and achieve a better hash | |
// function. We use the formula `x*w + y*(1-w)` to generate an address between | |
// x and y given some weight w. Note that a weight of 1/2 is the same as the | |
// naïve solution. | |
// | |
// Weights of 1/p where p is prime and greater than 2 perform best as a hash | |
// function. However, the size of the address space decreases rapidly as p | |
// increases because of the requirement for more significant bits. Thus p=3 | |
// seems to be the best balance of entropy and space. Empirical evidence | |
// suggests the tree can safely grow to a depth greater than 32 and the | |
// addresses contain enough entropy that I am comfortable using the low bits as | |
// bucket numbers in hash tables. | |
// | |
// # Future Work | |
// | |
// All of these results we derived empirically, and more theoretical work would | |
// be appreciated. I would love to have a formula for the maximum depth of the | |
// tree given the weight, as well as a proof that this method is free from | |
// address collisions. | |
package main | |
import ( | |
"fmt" | |
"math" | |
"math/rand" | |
"unsafe" | |
) | |
const ( | |
weight = 1.0 / 3 | |
trials = 256 // number of walks in the experiment | |
maxDepth = 32 // depth of each walk | |
histDepth = 4 // only addresses deeper than this are recorded in the histogram | |
) | |
func addr(lo, hi float64) float64 { | |
return lo*(weight) + hi*(1-weight) | |
} | |
// The main experiment is simple. We take random walks through the address space | |
// and record the path we took to reach that address. If we encounter the same | |
// address from a previous walk, we verify that it is on the same path. We | |
// record the hash value of deep nodes into a histogram to verify entropy. | |
func main() { | |
var ( | |
lo, hi float64 // address bounds of the current subtree | |
x float64 // root of the current subtree | |
hist [32]int // histogram for checking entropy | |
// We record the path to each address we encounter. A path is a list of | |
// decisions to go left or right. If the same address can be reach by | |
// multiple paths, our method is incorrect. | |
paths = make(map[float64][]bool) | |
) | |
fmt.Println("depth\taddress \tbits \thash") | |
for i := 0; i < trials; i++ { | |
lo, hi = 0, 1 | |
x = lo/2 + hi/2 | |
var path []bool | |
for i := 0; i < maxDepth; i++ { | |
// Check that the current subtree has not been visited by any other path. | |
seen := paths[x] | |
if seen == nil { | |
paths[x] = path | |
} else { | |
if len(seen) != len(path) { | |
panic(fmt.Errorf("%v visited by two paths:\n\t%v and\n\t%v", x, seen, path)) | |
} | |
for i := range path { | |
if path[i] != seen[i] { | |
panic(fmt.Errorf("%v visited by two paths:\n\t%v and\n\t%v", x, seen, path)) | |
} | |
} | |
} | |
// Record the low bits of the current node into the histogram. | |
// We only record nodes from long paths, because there are very few | |
// short paths and thus the same short paths occur at the beginning | |
// of most trials. | |
bits := *(*uint64)(unsafe.Pointer(&x)) | |
hash := bits % uint64(len(hist)) | |
if i > histDepth { | |
hist[hash]++ | |
} | |
fmt.Printf("%d\t%0.13f\t%0.64b\t%d\n", i, x, bits, hash) | |
// Decide which path to take and generate the next address. | |
if rand.Intn(2) == 1 { | |
path = append(path, true) | |
lo = x | |
x = addr(x, hi) | |
} else { | |
path = append(path, false) | |
hi = x | |
x = addr(x, lo) | |
} | |
} | |
} | |
fmt.Println() | |
fmt.Printf("Hash distribution for nodes deeper than %v:\n", histDepth) | |
fmt.Println(hist) | |
var mean float64 | |
var variance float64 | |
for i := range hist { | |
mean += float64(hist[i]) / float64(len(hist)) | |
} | |
for i := range hist { | |
x := float64(hist[i]) | |
variance += (x - mean) * (x - mean) / float64(len(hist)) | |
} | |
var sd = math.Sqrt(variance) | |
fmt.Println("mean:\t", mean) | |
fmt.Println("sd:\t", sd) | |
for i := range hist { | |
x := float64(hist[i]) | |
if math.Abs(x-mean) > 2*sd { | |
panic("possibly biased distribution") | |
} | |
} | |
fmt.Println() | |
fmt.Println("SUCCESS") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment