Skip to content

Instantly share code, notes, and snippets.

@ithayer
Created June 28, 2011 06:32
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ithayer/1050607 to your computer and use it in GitHub Desktop.
Save ithayer/1050607 to your computer and use it in GitHub Desktop.
Simple tf-idf in 30 lines of clojure. Inspired by a nice simple scala implementation: https://github.com/felipehummel/TinySearchEngine/blob/master/scala/tinySearch.scala and matches as closely as possible the computation.
(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun.
(def stopwords (set (string/split #"\n" (slurp "./stopwords.txt"))))
(defn tokenize [raw-text] ;; Lowercases and splits on non-letters, non-numbers.
(remove stopwords (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+" (string/lower-case raw-text))))
(defn idf2 [n-docs match] (Math/pow (Math/log (/ n-docs (count (keys match)))) 2))
(defn index-one [fname] ;; Index for one file. Given an fname, returns a map of token -> map of (fname, count)
(let [word-counts (frequencies (tokenize (slurp fname)))]
(zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts)))))
(defn accum-tfidf [total-doc-count match] ;; Given the total term count and a map of doc -> count, accumulate tfidf for docs.
(map (fn [doc w-count] {doc (* w-count (idf2 total-doc-count match))}) (keys match) (vals match)))
(defn search [db total-doc-count raw-text] ;; Returns accumulated tfidf for each doc.
(let [results (keep db (tokenize raw-text))] ;; Each result is one term lookup.
(apply merge-with + (mapcat (partial accum-tfidf total-doc-count) results))))
(defn read-and-search [db total-doc-count doc-norms raw-text]
(let [results (search db total-doc-count raw-text)
scores (take 3 (reverse (sort-by second (map (fn [k] [k (/ (results k) (doc-norms k))]) (keys results)))))]
(println "FOR: " raw-text " matched: " results)))
(defn -main [& args]
(let [db (apply merge-with merge (map index-one args))
doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf (count args)) (vals db)))
doc-norms (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
(map (partial read-and-search db (count args) doc-norms) (line-seq (java.io.BufferedReader. *in*)))))
@scottjad
Copy link

;; Here are a few improvements:

(defn fmap-vals [f m]
  (fmap (fn [[k v]] [k (f v)]) m))      ;fmap from contrib

(zipmap (keys word-counts) (map (fn [c] {fname c}) (vals word-counts)))
(zipmap (keys word-counts) (map fname (vals word-counts)))
(into {} (map (fn [[k v]] [k (fname v)]) word-counts))
(fmap (fn [[k v]] [k (fname v)]) word-counts)
(fmap-vals fname word-counts)

(map (fn [doc w-count] {doc (* w-count (idf2 total-doc-count match))}) (keys match) (vals match))
(map (fn [[doc w-count]] {doc (* w-count (idf2 total-doc-count match))}) match)

(zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))
(into {} (map (fn [[k v]] [k (Math/sqrt v)]) doc-norms-raw))
(fmap (fn [[k v]] [k (Math/sqrt v)]) doc-norms-raw)
(fmap-vals #(Math/sqrt %) doc-norms-raw)

@ithayer
Copy link
Author

ithayer commented Jun 28, 2011 via email

@chernyshev-alex
Copy link

probably --> doall required
(doall (map (partial read-and-search db
(count args) doc-norms) (line-seq (java.io.BufferedReader. in))))))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment