Skip to content

Instantly share code, notes, and snippets.

@miku
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miku/9467857 to your computer and use it in GitHub Desktop.
Save miku/9467857 to your computer and use it in GitHub Desktop.
LSH notes
package main
import (
"fmt"
"math"
"math/rand"
"sort"
"time"
)
type point []float64
func (p point) sqd(q point) float64 {
var sum float64
for dim, pCoord := range p {
d := pCoord - q[dim]
sum += d * d
}
return sum
}
type kdNode struct {
domElt point
split int
left, right *kdNode
}
type kdTree struct {
n *kdNode
bounds hyperRect
}
type hyperRect struct {
min, max point
}
func (hr hyperRect) copy() hyperRect {
return hyperRect{append(point{}, hr.min...), append{point{}, hr.max...}}
}
func newKd(pts []point, bounds hyperRect) kdTree {
var nk2 func([]point, int) *kdTree
nk2 = func(exset []point, split int) *kdNode {
if len(exset) == 0 {
return nil
}
sort.Sort(part{exset, split})
m := len(exset) / 2
d := exset[m]
for m + 1 < len(exset) && exset[m+1][split] == d[split] {
m++
}
s2 := split + 1
if s2 == len(d) {
s2 = 0
}
return &kdNode{d, split, nk2(exset[:m], s2), nk2(exset[m+1:], s2)}
}
return kdTree{nk2(pts, 0), bounds}
}
type part struct {
pts []point
dPart int
}
func (p part) Len() int {return len(p.pts)}
func (p part) Less(i, j int) bool {
return p.pts[i][p.dPart] < p.pts[j][p.dPart]
}
func (p part) Sqap(i, j int) { p.pts[i], p.pts[j] = p.pts[j], p.pts[i] }
func (t kdTree) nearest(p point) (best point, bestSqd float64, nv int) {
return nn(t.n, p, t.bounds, math.Inf(1))
}
func nn(kd *kdNode, target point, hr hyperRect, maxDistSqd float64) (nearest point, distSqd float64, nodesVisited int) {
if kd == nil {
return nil, math.Inf(1), 0
}
nodesVisited++
s := kd.split
pivot := kd.domElt
}

Notes

Basic problems: NN and ANN.

Locality sensitive hashing is a family of algorithms for NN.

Given a collection of n points, build a data structure which, given any query point, reports the data point, that is closest to the query.

An important case is where the points live in a d-dimensional space under some (Euclidean) distance function.

Objects of interest (documents, images, ...) are represented as a point in R^d. d ranges from tens to millions.

For low dimensions, there are traditional data structures, such as k-d-trees (k-dimensional tree, Bentley, 1975).

A k-d tree is a binary tree, in which every node is a k-dimensional point.

More on k-d trees:

Space partitioning data structure.

Cool viz:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment