Created
July 9, 2013 15:19
-
-
Save sile/5958231 to your computer and use it in GitHub Desktop.
locality sensitive hashingの実装メモ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; paper: http://www.isi.edu/natural-language/people/ravichan/papers/clustering.pdf | |
(defun cos-sim (u v &key product length) | |
(/ (funcall product u v) | |
(* (funcall length u) (funcall length v)))) | |
(defun h.r (r u) | |
(if (>= (funcall r u) 0) | |
1 | |
0)) | |
;; http://grepcode.com/file/repo1.maven.org/maven2/cc.ivory/ivory/0.7.3/ivory/lsh/projection/WriteRandomVectors.java?av=f | |
(defun generate-unit-random-vector (num-samples) | |
(let ( | |
(vector (make-array num-samples :initial-element 0)) | |
(normalization-factor 0) | |
) | |
(loop FOR i FROM 0 BELOW num-samples | |
DO | |
(multiple-value-bind (x y r) | |
(loop FOR x = (- (* 2.0 (random 1.0)) 1.0) | |
FOR y = (- (* 2.0 (random 1.0)) 1.0) | |
FOR r = (+ (* x x) (* y y)) | |
WHILE (or (> r 1) (= r 0)) | |
FINALLY | |
(return (values x y r))) | |
(declare (ignore y)) | |
(let ((f (* x (sqrt (- 2.0 (/ (log r) r)))))) | |
(incf normalization-factor (expt f 2)) | |
(setf (aref vector i) f)))) | |
;; normalize vector | |
(setf normalization-factor (sqrt normalization-factor)) | |
(loop FOR i FROM 0 BELOW num-samples | |
FOR val = (aref vector i) | |
FOR newf = (/ val normalization-factor) | |
DO | |
(setf (aref vector i) newf)) | |
(coerce vector 'list))) | |
(defun calc (input d q b thres) | |
(let* ((samples (length (car input))) | |
(rs (loop REPEAT d COLLECT (generate-unit-random-vector samples))) | |
(bits (loop FOR v IN input | |
COLLECT | |
(coerce | |
(loop FOR r IN rs | |
COLLECT (calc-h-r v r)) | |
'list #|'bit-vector|#)))) | |
(let ((bits-perm | |
(loop REPEAT q | |
FOR perm-table = (make-perm-table d) | |
COLLECT | |
(loop FOR i FROM 0 | |
FOR bit IN bits | |
COLLECT (list i (rotate-bit perm-table bit)))))) | |
(let ((sorted-bits-perm | |
(loop FOR entry IN bits-perm | |
FOR sorted = (sort entry (lambda (as bs) | |
(loop FOR a IN as | |
FOR b IN bs | |
DO | |
(cond ((< a b) | |
(return t)) | |
((> a b) | |
(return nil))) | |
FINALLY | |
(return nil))) | |
:key #'second) | |
COLLECT sorted))) | |
(let ((m (make-hash-table))) | |
(loop FOR list IN sorted-bits-perm | |
DO | |
(mapl (lambda (cdr) | |
(destructuring-bind ((i bits) . cdr) cdr | |
(let ((sims (collect-sim bits cdr b thres))) | |
(setf (gethash i m) | |
(delete-duplicates (append sims (gethash i m)) | |
:key #'car))))) | |
list)) | |
m)) | |
))) | |
(defun collect-sim (as list b thres &optional (acc '())) | |
(if (or (= b 0) | |
(null list)) | |
(nreverse acc) | |
(destructuring-bind ((i bs) . rest) list | |
(let ((p (pr as bs))) | |
(if (>= p thres) | |
(collect-sim as rest (1- b) thres (cons `(,i ,bs ,p) acc)) | |
(collect-sim as rest (1- b) thres acc)))))) | |
(defun hamming-distance (as bs) | |
(loop WITH mismatch-count = 0 | |
FOR a IN as | |
FOR b IN bs | |
DO | |
(when (/= a b) | |
(incf mismatch-count)) | |
FINALLY | |
(return mismatch-count))) | |
(defun pr (as bs) | |
(- 1 (/ (hamming-distance as bs) (length as)))) | |
(defun shuffle (ary) | |
(dotimes (i (length ary) ary) | |
(rotatef (aref ary i) (aref ary (random (length ary)))))) | |
(defun make-perm-table (size) | |
(shuffle (coerce (loop FOR i FROM 0 BELOW size COLLECT i) 'vector))) | |
(defun rotate-bit (perm-table bit-list) | |
(loop FOR i FROM 0 BELOW (length perm-table); | |
FOR j = (aref perm-table i) | |
DO | |
(rotatef (elt bit-list i) (elt bit-list j))) | |
bit-list) | |
(defun calc-h-r (v r) | |
(let ((p (product v r))) | |
(if (>= p 0) | |
1 | |
0))) | |
(defun product (v r) | |
(assert (= (length v) (length r))) | |
(reduce #'+ (mapcar #'* v r))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment