Skip to content

Instantly share code, notes, and snippets.

@herdrick
Created June 6, 2010 02:27
Show Gist options
  • Save herdrick/427211 to your computer and use it in GitHub Desktop.
Save herdrick/427211 to your computer and use it in GitHub Desktop.
(ns hc (:use [incanter.core :only (abs sq sqrt)]
[incanter.stats :only (mean)]
[clojure.contrib.combinatorics :only (combinations)]))
(defn make-rfo [{:keys [score relfreqs interesting rfos-or-file]}]
[score relfreqs interesting rfos-or-file])
(def score first)
(def relfreqs second)
(def interesting #(nth % 2))
(def rfos-or-file #(nth % 3))
(defn rfo= [rfo1 rfo2]
(= (rfos-or-file rfo1) (rfos-or-file rfo2)))
(def *interesting-words-count* 3)
(def file->seq (memoize (fn [file]
(re-seq #"[a-z]+"
(org.apache.commons.lang.StringUtils/lowerCase (slurp (.toString file)))))))
(defn freqs [words]
(reduce (fn [freqs obj]
(merge-with + freqs {obj 1}))
{} words))
(def words->relative-freq (memoize (fn [docu]
(let [freqs (freqs docu)
word-count (count docu)]
(reduce (fn [rel-freqs key]
(conj rel-freqs [key (/ (float (freqs key)) word-count)])) ;would be clearer as (merge rel-freqs {key (/ (float (freqs key)) word-count)}) maybe
{} docu)))))
;euclidean distance
(defn euclid [relfreqs1 relfreqs2 word-list]
(sqrt (reduce + (map (fn [word]
(sq (abs (- (get relfreqs1 word 0) (get relfreqs2 word 0)))))
word-list))))
(defn combine-relfreqs [rf1 rf2]
(merge-with #(mean [% %2]) rf1 rf2)) ; i'm just combining relfreqs taking their (unweighted) mean.
(def relative-freqs (memoize (fn [rfo]
(let [r-o-f (rfos-or-file rfo)]
(if (instance? java.io.File r-o-f)
(words->relative-freq (file->seq r-o-f))
(combine-relfreqs (relative-freqs (first r-o-f))
(relative-freqs (second r-o-f))))))))
(defn interesting-words [relfreqs omni-relfreq count]
(take count (sort #(> (abs (second %)) (abs (second %2)))
(map (fn [[word freq]]
[word (- (or (get relfreqs word) 0) freq)])
omni-relfreq))))
(def score-pair (memoize (fn [word-list [rfo1 rfo2]]
(make-rfo {:score (euclid (relfreqs rfo1) (relfreqs rfo2) word-list)
:rfos-or-file [(make-rfo {:score (score rfo1) :interesting (interesting rfo1) :rfos-or-file (rfos-or-file rfo1)}) ;making a mock rfo here preserving the values of rfo1. lacks: relfreqs
(make-rfo {:score (score rfo2) :interesting (interesting rfo2) :rfos-or-file (rfos-or-file rfo2)})]}))))
(defn best-pairing [rfos word-list omni-relfreq]
(let [combos (sort (fn [rfo1 rfo2] (compare (score rfo1) (score rfo2))) ; rfo1 and rfo2 are mock rfos, each representing a candidate pair. the best scoring one will be made into a full rfo.
(map (partial score-pair word-list)
(combinations rfos 2)))
best-pair (first combos)
relfreqs (relative-freqs best-pair)]
(make-rfo {:score (score best-pair) :relfreqs relfreqs :interesting (interesting-words relfreqs omni-relfreq *interesting-words-count*) :rfos-or-file (rfos-or-file best-pair)})))
;makes an agglomerative hierarchical cluster of the rfos.
(defn cluster [rfos word-list omni-relfreq]
(if (= (count rfos) 1)
rfos
(let [best-pairing-rfo (best-pairing rfos word-list omni-relfreq)
rfos-cleaned (filter (complement (fn [rfo]
(or (rfo= rfo (first (rfos-or-file best-pairing-rfo)))
(rfo= rfo (second (rfos-or-file best-pairing-rfo))))))
rfos)]
(cluster (conj rfos-cleaned best-pairing-rfo) word-list omni-relfreq))))
(def *directory-string* "/Users/herdrick/Dropbox/clojure/hierarchical-classifier/data/mixed")
(def *txt-files* (seq (org.apache.commons.io.FileUtils/listFiles (new java.io.File *directory-string*) nil true)))
(def *omni-doc* (apply concat (map file->seq *txt-files*)))
(def *corpus-word-list* (set *omni-doc*))
(def *corpus-relfreqs* (words->relative-freq *omni-doc*))
(def *docs-rfos* (map (fn [file]
(let [relfreqs (words->relative-freq (file->seq file))]
(make-rfo {:relfreqs relfreqs :interesting (interesting-words relfreqs *corpus-relfreqs* *interesting-words-count*) :rfos-or-file file})))
*txt-files*))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment