Skip to content

Instantly share code, notes, and snippets.

@death
Created September 4, 2018 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save death/f6e0ef136dbe00b83dc716ae4db766ab to your computer and use it in GitHub Desktop.
Save death/f6e0ef136dbe00b83dc716ae4db766ab to your computer and use it in GitHub Desktop.
MinHash example
(defpackage #:snippets/minhash
(:documentation
"On removing duplicates from a set of documents
http://stevehanov.ca/blog/index.php?id=144")
(:use #:cl)
(:import-from #:ppcre
#:split)
(:shadowing-import-from #:fset
#:convert
#:set
#:less
#:size
#:intersection
#:union
#:reduce
#:with
#:lookup
#:empty-map
#:empty-set
#:do-set)
(:import-from #:alexandria
#:shuffle
#:iota
#:when-let
#:minf))
(in-package #:snippets/minhash)
(defun words (text)
(less (convert 'set (split "[^a-zA-Z]" text)) ""))
(defun jaccard (set1 set2)
(- 1
(/ (size (intersection set1 set2))
(size (union set1 set2)))))
(defun sum (collection)
(reduce #'+ collection :initial-value 0))
(defun make-hash-fns (corpus k)
(let ((orderings (loop repeat k collect (shuffle (iota (size corpus))))))
(reduce (lambda (words word)
(prog1 (with words word (map 'vector #'first orderings))
(setf orderings (mapcar #'rest orderings))))
corpus
:initial-value (empty-map))))
(defun minhash (corpus &optional (k 5))
(let ((hash-fns (make-hash-fns corpus k)))
(lambda (document)
;; For each hash function, find the lowest value word in the
;; document.
(let ((vals (make-array k :initial-element most-positive-fixnum)))
(do-set (word document)
(when-let (indices (lookup hash-fns word))
(dotimes (i k)
(minf (aref vals i)
(aref indices i)))))
;; The mean average is a bit crappy with small k and corpus.
(/ (sum vals) k)))))
(defun corpus (documents)
(reduce (lambda (corpus document)
(union corpus (words document)))
documents
:initial-value (empty-set)))
(defun documents ()
(list "Eenie, meenie, miney, mo
Catch a bad chick by her toe
If she holla
If, if, if she holla, let her go"
"Eenie, meenie, miny, mo
Catch a bad chick by her toe
If she holla
If, if, if she holla, let her go"
"Fly me to the moon
Let me play among the stars
Let me see what spring is like on
A-Jupiter and Mars"))
(defun word-sets (documents)
(mapcar #'words documents))
(defun test (&optional (documents (documents)) (k 5))
(mapcar (minhash (corpus documents) k)
(word-sets documents)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment