-
-
Save pcsanwald/f155848bfc242da37f74 to your computer and use it in GitHub Desktop.
TF-IDF code. Works great locally; on Elastic MapReduce, not so much!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns canalyze.cascade | |
(:use [cascalog.api] | |
[clojure.tools.logging] | |
[cascalog.more-taps :only (hfs-delimited)]) | |
(:require [clojure.string :as s] | |
[cascalog [ops :as c] [vars :as v]] | |
[clojure.data.json :refer [write-str read-str]] | |
) | |
(:gen-class)) | |
;; Takes a line of input, which for our case is multiple records encoded as
;; JSON, and separates them into individual records (one emitted tuple per
;; element of the parsed value).
;;
;; A line that cannot be parsed is logged at WARN level and contributes no
;; records: the catch branch evaluates to the logging call's result, which is
;; expected to be nil.
(defmapcatop separate-records [line]
  (try
    (read-str line)
    (catch Exception e
      ;; The throwable must be the FIRST argument for clojure.tools.logging to
      ;; treat it as an exception (and log its stack trace); passed last it is
      ;; merely stringified into the message.
      (warn e (str "exception parsing line: " line)))))
;; Takes a record and splits the text stored under its "body" key (in our
;; case, the email text) into term tokens by regex.
;;
;; Emits nothing when the record has no "body", or when the value is not a
;; string (e.g. a JSON null), instead of throwing inside the job.
;; Empty tokens -- produced by the split when the text begins with a
;; delimiter -- are dropped so they are not counted as a word downstream.
(defmapcatop split-record [record]
  (let [body (get record "body")]
    (when (string? body)
      (vec (remove empty? (s/split body #"[\[\]\\\(\),.)\s]+"))))))
(defn get-doc-id
  "Returns the value stored under the \"id\" key of `record`, or nil when
  `record` is nil or has no such key."
  [record]
  ;; `get` instead of invoking the record as a function: a nil record then
  ;; yields nil rather than throwing a NullPointerException.
  (get record "id"))
(defn scrub-text
  "Lower-cases the string `s` and trims whitespace from both of its ends."
  [s]
  ;; The local `s` and the `s/` namespace alias do not clash: `s/lower-case`
  ;; and `s/trim` are resolved through the alias.
  (-> s s/lower-case s/trim))
(defn word-count
  "Basic word count: builds a query over `src` that emits each ?word together
  with ?count, the number of tuples it occurs in. The underscore in the
  generator ignores the first field of `src` (the doc id)."
  [src]
  (<- [?word ?count]
      (src _ ?word)
      (c/count ?count)))
(defn etl-docs
  "Builds the [?doc-id ?word] query that feeds the counting queries.
  Each ?line from `docs` is expanded into records, each record into the
  tokens of its body, and every token is scrubbed (lower-cased and trimmed)
  before being checked against `stop`."
  [docs stop]
  (<- [?doc-id ?word]
      (docs ?line)
      (separate-records ?line :> ?record)
      (get-doc-id ?record :> ?doc-id)
      (split-record ?record :> ?word-dirty)
      (scrub-text ?word-dirty :> ?word)
      ;; NOTE(review): presumably keeps only words that do NOT appear in the
      ;; `stop` source -- confirm against Cascalog's generator-as-set semantics.
      (stop ?word :> false)))
(defn D
  "Builds a query with the single output field ?n-docs: the distinct count
  of ?doc-id values found in `src`."
  [src]
  ;; Narrow the source to its ?doc-id field before counting.
  (let [doc-ids (select-fields src ["?doc-id"])]
    (<- [?n-docs]
        (doc-ids ?doc-id)
        (c/distinct-count ?doc-id :> ?n-docs))))
(defn DF
  "Builds the document-frequency query over `src` (tuples of doc id and
  word): for each ?df-word, ?df-count is the distinct count taken over the
  (?doc-id, ?df-word) pairs."
  [src]
  (<- [?df-word ?df-count]
      (src ?doc-id ?df-word)
      (c/distinct-count ?doc-id ?df-word :> ?df-count)))
(defn TF
  "Builds the term-frequency query over `src` (tuples of doc id and word):
  for each (?doc-id, ?tf-word) pair, ?tf-count is the number of tuples."
  [src]
  (<- [?doc-id ?tf-word ?tf-count]
      (src ?doc-id ?tf-word)
      (c/count ?tf-count)))
(defn tf-idf-formula
  "Computes tf-count * ln(n-docs / (1.0 + df-count))."
  [tf-count df-count n-docs]
  (let [smoothed-df (+ 1.0 df-count)
        ratio       (div n-docs smoothed-df)
        idf         (Math/log ratio)]
    (* tf-count idf)))
(defn TF-IDF
  "Builds the [?doc-id ?tf-idf ?tf-word] query over `src` by joining the TF
  and DF queries on the word and applying `tf-idf-formula`."
  [src]
  ;; `??-` executes the D query right here, while the TF-IDF query is being
  ;; built; the first value of its flattened result is the document total,
  ;; which is then passed into the formula as a plain constant.
  (let [total-docs (->> (??- (D src)) flatten first)
        tf-query   (TF src)
        df-query   (DF src)]
    (<- [?doc-id ?tf-idf ?tf-word]
        (tf-query ?doc-id ?tf-word ?tf-count)
        (df-query ?tf-word ?df-count)
        (tf-idf-formula ?tf-count ?df-count total-docs :> ?tf-idf))))
(defn -main
  "Entry point. Arguments are paths:
    in    - documents, read as text lines
    stop  - delimited stop-word file (header row skipped)
    out   - destination of the word-count output (text lines)
    tfidf - destination of the TF-IDF output (delimited)
  Any further arguments are accepted and ignored."
  [in stop out tfidf & args]
  (let [stop-tap (hfs-delimited stop :skip-header? true)
        doc-tap  (hfs-textline in :skip-header? false)
        tokens   (etl-docs doc-tap stop-tap)]
    ;; Two separate executions: TF-IDF first, then the plain word count.
    (?- (hfs-delimited tfidf) (TF-IDF tokens))
    (?- (hfs-textline out) (word-count tokens))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment