-
-
Save raek/1050828 to your computer and use it in GitHub Desktop.
Simple tf-idf in 30 lines of Clojure. Inspired by a nice, simple Scala implementation (https://github.com/felipehummel/TinySearchEngine/blob/master/scala/tinySearch.scala); the computation matches that implementation as closely as possible.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun. | |
;; Set of stopwords, loaded once when the namespace is evaluated.
;; ./stopwords.txt is read whole and split on newlines, one entry per line.
(def *stopwords*
  (->> (slurp "./stopwords.txt")
       (string/split #"\n")
       set))
(defn tokenize
  "Lowercases raw-text, splits it on runs of characters that are neither
  digits nor one of the letters listed in the pattern, and returns a lazy
  seq of the resulting tokens with empty strings and stopwords removed."
  [raw-text]
  (->> (string/lower-case raw-text)
       (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+")
       ;; The regex split yields a leading "" when the text starts with a
       ;; separator (whitespace, punctuation, ...) and a lone "" for empty
       ;; input; "" is not a real term and must not be indexed or searched.
       (remove empty?)
       (remove *stopwords*)))
(defn idf2
  "Squared inverse document frequency for one term.
  n-docs is the total number of documents; match is a map with one entry
  per document containing the term. Returns (ln (n-docs / entries))^2."
  [n-docs match]
  (let [doc-freq (count match)
        idf      (Math/log (/ n-docs doc-freq))]
    (Math/pow idf 2)))
(defn index-one
  "Index for a single file. Reads fname, tokenizes its contents and returns
  a map of token -> {fname count}, where count is how often the token
  occurs in that file."
  [fname]
  (let [token-freqs (frequencies (tokenize (slurp fname)))]
    (into {}
          (for [[token n] token-freqs]
            [token {fname n}]))))
(defn accum-tfidf
  "Given the total number of docs and, for one term, a map of doc -> count,
  returns a seq of single-entry maps {doc weight}, one per doc, where
  weight is count multiplied by the term's idf2."
  [total-doc-count match]
  (for [[doc w-count] match]
    ;; idf2 is evaluated per entry, so it is never called for an empty match.
    {doc (* w-count (idf2 total-doc-count match))}))
(defn search
  "Returns a map of doc -> accumulated tf-idf weight for the query raw-text.
  db maps token -> {doc count}. Yields nil when no query token is in db."
  [db total-doc-count raw-text]
  (let [hits    (keep db (tokenize raw-text)) ;; one db entry per known query token
        per-doc (mapcat #(accum-tfidf total-doc-count %) hits)]
    ;; Sum the weights each doc received across all matched tokens.
    (apply merge-with + per-doc)))
(defn read-and-search
  "Runs the query raw-text against db and prints the three best matches.
  db maps token -> {doc count}; doc-norms maps doc -> norm. Each doc's
  accumulated tf-idf weight is divided by its norm, and the docs are ranked
  by that normalized score, highest first. Returns nil (from println)."
  [db total-doc-count doc-norms raw-text]
  (let [results    (search db total-doc-count raw-text)
        normalized (map (fn [k] [k (/ (results k) (doc-norms k))]) (keys results))
        scores     (take 3 (reverse (sort-by second normalized)))]
    ;; Print the ranked, normalized top hits. Printing `results` here would
    ;; discard the ranking and show the raw, unnormalized accumulations.
    (println "FOR: " raw-text " matched: " scores)))
(defn -main
  "Entry point. args are the file names to index. Builds the combined index
  and the per-document norms, then reads queries from *in*, one per line,
  and prints the search result for each. Returns nil."
  [& args]
  (let [n-docs        (count args)
        ;; token -> {doc count}, merged across all indexed files.
        db            (apply merge-with merge (map index-one args))
        ;; Per-doc sum of the weights of all its terms; its sqrt is the norm.
        doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf n-docs) (vals db)))
        doc-norms     (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
    ;; doseq rather than map: map is lazy, so when the return value of -main
    ;; is discarded no query would ever be read or answered.
    (doseq [query (line-seq (java.io.BufferedReader. *in*))]
      (read-and-search db n-docs doc-norms query))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment