ithayer/tf-idf.clj

## tf-idf.clj
(ns ignacio.tfidf (:require [clojure.contrib.string  :as string])) ;; Simple tfidf in clojure, for fun.

;; Set of stopwords, read once at load time from ./stopwords.txt (one word per
;; line; the path is relative to the working directory). Splitting on \r?\n
;; keeps a CRLF-terminated file from leaving a trailing \r on every word, which
;; would prevent the words from ever matching a token.
(def stopwords
  (set (string/split #"\r?\n" (slurp "./stopwords.txt"))))

;; Lowercases raw-text, splits it on runs of characters that are neither digits
;; nor one of the (accented) letters listed in the pattern, and drops stopwords.
;; Returns a lazy seq of token strings.
;; Empty strings are dropped too: the regex split yields a leading "" whenever
;; the text starts with a separator (and a lone "" for empty text), and that ""
;; would otherwise be indexed and searched like a real term.
(defn tokenize [raw-text]
  (->> (string/lower-case raw-text)
       (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+")
       (remove empty?)
       (remove stopwords)))

;; Squared inverse document frequency of one term: (ln (n-docs / df))^2, where
;; df is the number of entries in match (the term's doc -> count map).
(defn idf2 [n-docs match]
  (let [doc-freq (count (keys match))
        idf      (Math/log (/ n-docs doc-freq))]
    (Math/pow idf 2)))

;; Builds the index fragment for a single file: reads fname, tokenizes its
;; contents, and returns a map of token -> {fname occurrence-count}.
(defn index-one [fname]
  (let [counts (frequencies (tokenize (slurp fname)))]
    (into {} (for [[token n] counts]
               [token {fname n}]))))

;; Turns one term's postings (match: doc -> occurrence count) into a lazy seq of
;; single-entry maps {doc (* count idf^2)}, ready to be summed per document with
;; merge-with. total-doc-count is the number of documents in the collection.
(defn accum-tfidf [total-doc-count match]
  (for [[doc w-count] match]
    {doc (* w-count (idf2 total-doc-count match))}))

;; Scores a free-text query against the index. Every query token found in db
;; contributes its per-document weights, which are then summed per document.
;; Returns a map of doc -> accumulated weight, or nil when no token is found.
(defn search [db total-doc-count raw-text]
  (let [postings (keep db (tokenize raw-text)) ;; one doc -> count map per known token
        weights  (mapcat #(accum-tfidf total-doc-count %) postings)]
    (apply merge-with + weights)))

;; Runs one query and prints its three best-matching documents.
;; The raw weights from search are divided by each document's norm (doc-norms:
;; doc -> norm) and sorted highest-first; it is this normalized ranking, not the
;; raw weight map, that is printed.
;; A zero norm (e.g. every term occurs in every document, so all idf weights
;; are 0) scores 0.0 instead of producing NaN.
(defn read-and-search [db total-doc-count doc-norms raw-text]
  (let [results (search db total-doc-count raw-text)
        scores  (->> results
                     (map (fn [[doc raw]]
                            (let [norm (doc-norms doc)]
                              [doc (if (zero? norm) 0.0 (/ raw norm))])))
                     (sort-by second >)
                     (take 3))]
    (println "FOR: " raw-text " matched: " scores)))

;; Entry point. args are the paths of the files to index; queries are then read
;; from standard input, one per line, and each is answered by read-and-search.
(defn -main [& args]
  (let [n-docs        (count args)
        ;; token -> {fname count}, merged across all indexed files
        db            (apply merge-with merge (map index-one args))
        ;; NOTE(review): this sums tf * idf^2 per document; a Euclidean tf-idf
        ;; norm would sum tf^2 * idf^2 -- confirm which is intended.
        doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf n-docs) (vals db)))
        doc-norms     (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
    ;; doseq rather than map: map is lazy, and since its result is discarded
    ;; here no query would ever run when this is started as a program.
    (doseq [query (line-seq (java.io.BufferedReader. *in*))]
      (read-and-search db n-docs doc-norms query))))
	(ns ignacio.tfidf (:require [clojure.contrib.string :as string])) ;; Simple tfidf in clojure, for fun.

	;; Set of stopwords, read once at load time from ./stopwords.txt (one word per
	;; line; the path is relative to the working directory). Splitting on \r?\n
	;; keeps a CRLF-terminated file from leaving a trailing \r on every word, which
	;; would prevent the words from ever matching a token.
	(def stopwords
	  (set (string/split #"\r?\n" (slurp "./stopwords.txt"))))

	;; Lowercases raw-text, splits it on runs of characters that are neither digits
	;; nor one of the (accented) letters listed in the pattern, and drops stopwords.
	;; Returns a lazy seq of token strings.
	;; Empty strings are dropped too: the regex split yields a leading "" whenever
	;; the text starts with a separator (and a lone "" for empty text), and that ""
	;; would otherwise be indexed and searched like a real term.
	(defn tokenize [raw-text]
	  (->> (string/lower-case raw-text)
	       (string/split #"[^a-z0-9äöüáéíóúãâêîôûàèìòùçñ]+")
	       (remove empty?)
	       (remove stopwords)))

	;; Squared inverse document frequency of one term: (ln (n-docs / df))^2, where
	;; df is the number of entries in match (the term's doc -> count map).
	(defn idf2 [n-docs match]
	  (let [doc-freq (count (keys match))
	        idf      (Math/log (/ n-docs doc-freq))]
	    (Math/pow idf 2)))

	;; Builds the index fragment for a single file: reads fname, tokenizes its
	;; contents, and returns a map of token -> {fname occurrence-count}.
	(defn index-one [fname]
	  (let [counts (frequencies (tokenize (slurp fname)))]
	    (into {} (for [[token n] counts]
	               [token {fname n}]))))

	;; Turns one term's postings (match: doc -> occurrence count) into a lazy seq of
	;; single-entry maps {doc (* count idf^2)}, ready to be summed per document with
	;; merge-with. total-doc-count is the number of documents in the collection.
	(defn accum-tfidf [total-doc-count match]
	  (for [[doc w-count] match]
	    {doc (* w-count (idf2 total-doc-count match))}))

	;; Scores a free-text query against the index. Every query token found in db
	;; contributes its per-document weights, which are then summed per document.
	;; Returns a map of doc -> accumulated weight, or nil when no token is found.
	(defn search [db total-doc-count raw-text]
	  (let [postings (keep db (tokenize raw-text)) ;; one doc -> count map per known token
	        weights  (mapcat #(accum-tfidf total-doc-count %) postings)]
	    (apply merge-with + weights)))

	;; Runs one query and prints its three best-matching documents.
	;; The raw weights from search are divided by each document's norm (doc-norms:
	;; doc -> norm) and sorted highest-first; it is this normalized ranking, not the
	;; raw weight map, that is printed.
	;; A zero norm (e.g. every term occurs in every document, so all idf weights
	;; are 0) scores 0.0 instead of producing NaN.
	(defn read-and-search [db total-doc-count doc-norms raw-text]
	  (let [results (search db total-doc-count raw-text)
	        scores  (->> results
	                     (map (fn [[doc raw]]
	                            (let [norm (doc-norms doc)]
	                              [doc (if (zero? norm) 0.0 (/ raw norm))])))
	                     (sort-by second >)
	                     (take 3))]
	    (println "FOR: " raw-text " matched: " scores)))

	;; Entry point. args are the paths of the files to index; queries are then read
	;; from standard input, one per line, and each is answered by read-and-search.
	(defn -main [& args]
	  (let [n-docs        (count args)
	        ;; token -> {fname count}, merged across all indexed files
	        db            (apply merge-with merge (map index-one args))
	        ;; NOTE(review): this sums tf * idf^2 per document; a Euclidean tf-idf
	        ;; norm would sum tf^2 * idf^2 -- confirm which is intended.
	        doc-norms-raw (apply merge-with + (mapcat (partial accum-tfidf n-docs) (vals db)))
	        doc-norms     (zipmap (keys doc-norms-raw) (map #(Math/sqrt %) (vals doc-norms-raw)))]
	    ;; Standard input is the dynamic var *in* (a bare `in` does not resolve).
	    ;; doseq rather than map: map is lazy, and since its result is discarded
	    ;; here no query would ever run when this is started as a program.
	    (doseq [query (line-seq (java.io.BufferedReader. *in*))]
	      (read-and-search db n-docs doc-norms query))))