Created
April 8, 2020 16:05
-
-
Save death/84c57eb0811421c1c51bf63f2fd716fd to your computer and use it in GitHub Desktop.
tfidf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Suppose we have a collection of N documents. | |
;; Define Fij to be the frequency of term i in document j. | |
;; Define TFij to be Fij / max_k(Fkj),
;; that is, the term frequency of term i in document j is Fij | |
;; normalized by dividing it by the maximum number of occurrences | |
;; of any term in the same document. | |
;; Define IDFi to be log2(N/ni), where ni is the number of documents
;; (out of N) in which term i appears.
;; The TF.IDF score for term i in document j is then defined to be | |
;; TFij x IDFi. The terms with the highest TF.IDF score are often | |
;; the terms that best characterize the topic of the document. | |
;; Redis keys used (i is a term, j is a document):
;; * doc-term-count:j - hash from term to its count in doc j
;; * num-terms-in-doc - hash from doc to the total number of term occurrences in that doc
;; * max-term - hash from doc to the term with highest count so far | |
;; * terms - a set of all terms added to any doc
;; * terms:j - a set of terms in doc j
;; * term-docs:i - set of documents containing term | |
;; * docs - a set of docs | |
(defpackage #:snippets/tfidf | |
(:documentation | |
"Redis-based tfidf.") | |
(:use #:cl) | |
(:import-from #:lredis) | |
(:export | |
#:add-document | |
#:add-term | |
#:list-documents | |
#:terms-by-score)) | |
(in-package #:snippets/tfidf) | |
(defun add-document (doc)
  "Add DOC to the Redis set stored under the key docs.
Returns whatever REDIS:SADD returns."
  (redis:sadd "docs" doc))
(defun add-term (doc term)
  "Record one occurrence of TERM in DOC.

Updates the global term set, DOC's total term counter, DOC's term set,
TERM's document set and the per-document count of TERM.  Afterwards the
max-term hash entry for DOC is replaced by TERM when DOC has no entry yet,
or when TERM's new count is strictly greater than the count of the term
currently stored there."
  (redis:sadd "terms" term)
  (redis:hincrby "num-terms-in-doc" doc 1)
  (redis:sadd (format nil "terms:~A" doc) term)
  (redis:sadd (format nil "term-docs:~A" term) doc)
  (let* ((count-key (format nil "doc-term-count:~A" doc))
         (new-count (redis:hincrby count-key term 1))
         (leader (redis:hget "max-term" doc)))
    (when (cond
            ;; No leader recorded yet: TERM becomes the leader.
            ((null leader) t)
            ;; TERM already is the leader: nothing to update.
            ((equal leader term) nil)
            ;; Otherwise TERM takes over only on a strictly higher count.
            (t (> new-count (doc-term-count leader doc))))
      (redis:hset "max-term" doc term))))
(defun doc-term-count (term doc)
  "Return the count stored for TERM in DOC's doc-term-count hash, as an integer.
A missing entry is treated as a count of 0."
  (let* ((count-key (format nil "doc-term-count:~A" doc))
         (raw (redis:hget count-key term)))
    (parse-integer (or raw "0"))))
(defun frequency (term doc)
  "Return the frequency of TERM in DOC: the count of TERM in DOC divided by
the total number of term occurrences recorded for DOC.

Returns 0 when DOC has no num-terms-in-doc entry or its total is zero,
instead of signalling an error from PARSE-INTEGER or a division by zero."
  (let ((total (parse-integer (or (redis:hget "num-terms-in-doc" doc)
                                  ;; Missing entry: no terms were ever added.
                                  "0"))))
    (if (zerop total)
        0
        (/ (doc-term-count term doc) total))))
(defun max-term (doc)
  "Return the value stored for DOC in the max-term hash.
ADD-TERM maintains that entry as the term with the highest count in DOC."
  (redis:hget "max-term" doc))
(defun max-frequency (doc)
  "Return the highest frequency of any term in DOC.

This is the FREQUENCY of the term recorded by MAX-TERM.  When DOC has no
max-term entry (no terms were added to it), return 0 rather than passing
NIL on as a term."
  (let ((leader (max-term doc)))
    (if leader
        (frequency leader doc)
        0)))
(defun number-of-documents ()
  "Return the cardinality of the docs set, i.e. how many documents
have been registered with ADD-DOCUMENT."
  (redis:scard "docs"))
(defun number-of-documents-containing (term)
  "Return the cardinality of TERM's term-docs set, i.e. how many
documents TERM has been added to."
  (let ((docs-key (format nil "term-docs:~A" term)))
    (redis:scard docs-key)))
(defun term-frequency (term doc)
  "Return the normalized term frequency TF of TERM in DOC.

TF is the count of TERM in DOC divided by the count of DOC's most frequent
term.  This equals FREQUENCY divided by MAX-FREQUENCY, because the document's
total term count cancels out; using the raw counts avoids the redundant
lookups of that total.

Returns 0 when DOC has no recorded max term, or that term's count is zero,
instead of signalling an error."
  (let ((leader (max-term doc)))
    (if (null leader)
        ;; No terms were ever added to DOC.
        0
        (let ((leader-count (doc-term-count leader doc)))
          (if (zerop leader-count)
              0
              (/ (doc-term-count term doc) leader-count))))))
(defun inverse-document-frequency (term)
  "Return the inverse document frequency of TERM: log2(N/n), where N is
NUMBER-OF-DOCUMENTS and n is NUMBER-OF-DOCUMENTS-CONTAINING TERM.

Returns 0.0 when n is zero (no document contains TERM) or N is zero (no
documents registered), because the ratio or its logarithm is undefined
there and would otherwise signal an error."
  (let ((total (number-of-documents))
        (containing (number-of-documents-containing term)))
    (if (or (zerop total) (zerop containing))
        0.0
        (log (/ total containing) 2))))
(defun score (term doc)
  "Return the TF.IDF score of TERM in DOC: the product of
TERM-FREQUENCY and INVERSE-DOCUMENT-FREQUENCY."
  (let* ((tf (term-frequency term doc))
         (idf (inverse-document-frequency term)))
    (* tf idf)))
(defun terms (doc)
  "Return the members of DOC's terms set, i.e. the distinct terms
added to DOC."
  (let ((terms-key (format nil "terms:~A" doc)))
    (redis:smembers terms-key)))
(defun terms-by-score (doc)
  "Return a list of (TERM . SCORE) conses for every term in DOC,
ordered by descending SCORE."
  (let ((scored (loop for term in (terms doc)
                      collect (cons term (score term doc)))))
    ;; SCORED is freshly consed, so the destructive SORT is safe.
    (sort scored #'> :key #'cdr)))
(defun list-documents ()
  "Return the members of the docs set, i.e. every document
registered with ADD-DOCUMENT."
  (redis:smembers "docs"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment