Skip to content

Instantly share code, notes, and snippets.

@pcsanwald
Created March 14, 2013 22:15
Show Gist options
  • Save pcsanwald/f155848bfc242da37f74 to your computer and use it in GitHub Desktop.
Save pcsanwald/f155848bfc242da37f74 to your computer and use it in GitHub Desktop.
TF-IDF code. Works great locally; on Elastic MapReduce, not so much!
(ns canalyze.cascade
(:use [cascalog.api]
[clojure.tools.logging]
[cascalog.more-taps :only (hfs-delimited)])
(:require [clojure.string :as s]
[cascalog [ops :as c] [vars :as v]]
[clojure.data.json :refer [write-str read-str]]
)
(:gen-class))
;; Takes a line of input, which for our case is a JSON document holding
;; multiple records, and separates it into individual records (one emitted
;; tuple per record).
;;
;; - A JSON array is emitted element by element.
;; - A single JSON object is emitted as one record, instead of being iterated
;;   as a sequence of map entries.
;; - Anything else (unparseable line, bare scalar) emits nothing.
(defmapcatop separate-records [line]
  (let [parsed (try
                 (read-str line)
                 (catch Exception e
                   ;; Throwable goes first so the logger treats it as the
                   ;; cause rather than stringifying it into the message.
                   (warn e (str "exception parsing line: " line))
                   nil))]
    (cond
      (map? parsed)        [parsed]
      (sequential? parsed) parsed
      :else                nil)))
;; Takes a record and splits the text under its "body" key (in our case, the
;; email text) into term tokens by regex.
;;
;; Emits nothing when the record has no string "body". Empty tokens, which
;; the split produces when the body starts with a delimiter, are dropped so
;; they are not counted as words downstream.
(defmapcatop split-record [record]
  (let [body (get record "body")]
    (when (string? body)
      (vec (remove empty? (s/split body #"[\[\]\\\(\),.)\s]+"))))))
(defn get-doc-id
  "Returns the value stored under the \"id\" key of `record`.
  Returns nil (rather than throwing) when `record` is nil or is not a
  keyed collection."
  [record]
  (get record "id"))
(defn scrub-text
  "Lower-cases `s` and trims leading and trailing whitespace."
  [s]
  ;; `s/...` resolves through the namespace alias, so the local `s` does not
  ;; interfere with it.
  (s/trim (s/lower-case s)))
(defn word-count
  "Basic word count query over `src`, a generator of two-field tuples whose
  second field is the word. The underscore ignores the first field (the
  doc id). Yields `?word ?count`."
  [src]
  (<- [?word ?count]
      (src _ ?word)
      (c/count ?count)))
(defn etl-docs
  "Builds the query that turns raw input lines into `?doc-id ?word` pairs.

  `docs` is a generator of single-field line tuples; `stop` is a generator
  of single-field word tuples. Each line is expanded into records with
  `separate-records`, each record's body is tokenised with `split-record`,
  and every token is normalised with `scrub-text`."
  [docs stop]
  (<- [?doc-id ?word]
      (docs ?raw-line)
      (separate-records ?raw-line :> ?rec)
      (get-doc-id ?rec :> ?doc-id)
      (split-record ?rec :> ?raw-token)
      (scrub-text ?raw-token :> ?word)
      ;; NOTE(review): presumably Cascalog's negation idiom, keeping only
      ;; words that have no match in `stop` -- confirm against Cascalog docs.
      (stop ?word :> false)))
(defn D
  "Query yielding a single `?n-docs` field: the distinct count of the
  `?doc-id` field selected from `src`."
  [src]
  (let [doc-ids (select-fields src ["?doc-id"])]
    (<- [?n-docs]
        (doc-ids ?doc-id)
        (c/distinct-count ?doc-id :> ?n-docs))))
(defn DF
  "Document-frequency query over `src` (tuples of doc id and word).
  Yields `?df-word ?df-count`, where `?df-count` is the distinct count
  computed per `?df-word`."
  [src]
  (<- [?df-word ?df-count]
      (src ?doc ?df-word)
      (c/distinct-count ?doc ?df-word :> ?df-count)))
(defn TF
  "Term-frequency query over `src` (tuples of doc id and word).
  Yields `?doc-id ?tf-word ?tf-count`, counting the tuples in each
  (doc id, word) group."
  [src]
  (<- [?doc-id ?tf-word ?tf-count]
      (src ?doc-id ?tf-word)
      (c/count ?tf-count)))
(defn tf-idf-formula
  "Computes tf-count * ln(n-docs / (1.0 + df-count)).

  `div` is not defined in this file; presumably it is the division helper
  brought in from cascalog.api -- confirm."
  [tf-count df-count n-docs]
  (let [smoothed-df (+ 1.0 df-count)
        idf         (Math/log (div n-docs smoothed-df))]
    (* tf-count idf)))
(defn TF-IDF
  "Builds the TF-IDF query over `src`, yielding `?doc-id ?tf-idf ?tf-word`.

  The document count is obtained first by executing `(D src)` with `??-`
  while this function runs; its first value is then passed to
  `tf-idf-formula` as a constant. The `TF` and `DF` queries are joined on
  the word field."
  [src]
  (let [total-docs (-> (??- (D src)) flatten first)
        tf-query   (TF src)
        df-query   (DF src)]
    (<- [?doc-id ?tf-idf ?tf-word]
        (tf-query ?doc-id ?tf-word ?tf-count)
        (df-query ?tf-word ?df-count)
        (tf-idf-formula ?tf-count ?df-count total-docs :> ?tf-idf))))
(defn -main
  "Entry point. Arguments, in order:
    in    - path read with `hfs-textline`
    stop  - path read with `hfs-delimited`, skipping its header
    out   - path the `word-count` results are written to
    tfidf - path the `TF-IDF` results are written to
  Any further arguments are ignored."
  [in stop out tfidf & args]
  (let [stop-tap (hfs-delimited stop :skip-header? true)
        line-tap (hfs-textline in :skip-header? false)
        tokens   (etl-docs line-tap stop-tap)]
    ;; The two sinks are written by two separate executions.
    (?- (hfs-delimited tfidf) (TF-IDF tokens))
    (?- (hfs-textline out) (word-count tokens))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment