Created
September 4, 2018 09:57
-
-
Save death/f6e0ef136dbe00b83dc716ae4db766ab to your computer and use it in GitHub Desktop.
MinHash example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defpackage #:snippets/minhash | |
(:documentation | |
"On removing duplicates from a set of documents | |
http://stevehanov.ca/blog/index.php?id=144") | |
(:use #:cl) | |
(:import-from #:ppcre | |
#:split) | |
(:shadowing-import-from #:fset | |
#:convert | |
#:set | |
#:less | |
#:size | |
#:intersection | |
#:union | |
#:reduce | |
#:with | |
#:lookup | |
#:empty-map | |
#:empty-set | |
#:do-set) | |
(:import-from #:alexandria | |
#:shuffle | |
#:iota | |
#:when-let | |
#:minf)) | |
(in-package #:snippets/minhash) | |
(defun words (text) | |
(less (convert 'set (split "[^a-zA-Z]" text)) "")) | |
(defun jaccard (set1 set2) | |
(- 1 | |
(/ (size (intersection set1 set2)) | |
(size (union set1 set2))))) | |
(defun sum (collection) | |
(reduce #'+ collection :initial-value 0)) | |
(defun make-hash-fns (corpus k) | |
(let ((orderings (loop repeat k collect (shuffle (iota (size corpus)))))) | |
(reduce (lambda (words word) | |
(prog1 (with words word (map 'vector #'first orderings)) | |
(setf orderings (mapcar #'rest orderings)))) | |
corpus | |
:initial-value (empty-map)))) | |
(defun minhash (corpus &optional (k 5)) | |
(let ((hash-fns (make-hash-fns corpus k))) | |
(lambda (document) | |
;; For each hash function, find the lowest value word in the | |
;; document. | |
(let ((vals (make-array k :initial-element most-positive-fixnum))) | |
(do-set (word document) | |
(when-let (indices (lookup hash-fns word)) | |
(dotimes (i k) | |
(minf (aref vals i) | |
(aref indices i))))) | |
;; The mean average is a bit crappy with small k and corpus. | |
(/ (sum vals) k))))) | |
(defun corpus (documents) | |
(reduce (lambda (corpus document) | |
(union corpus (words document))) | |
documents | |
:initial-value (empty-set))) | |
(defun documents () | |
(list "Eenie, meenie, miney, mo | |
Catch a bad chick by her toe | |
If she holla | |
If, if, if she holla, let her go" | |
"Eenie, meenie, miny, mo | |
Catch a bad chick by her toe | |
If she holla | |
If, if, if she holla, let her go" | |
"Fly me to the moon | |
Let me play among the stars | |
Let me see what spring is like on | |
A-Jupiter and Mars")) | |
(defun word-sets (documents) | |
(mapcar #'words documents)) | |
(defun test (&optional (documents (documents)) (k 5)) | |
(mapcar (minhash (corpus documents) k) | |
(word-sets documents))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment