Skip to content

Instantly share code, notes, and snippets.

View herdrick's full-sized avatar

Ethan Herdrick herdrick

View GitHub Profile
;; Relative word frequencies for `pof` (anything `to-words` accepts).
;; Returns a map from each distinct word to (occurrences / total word count).
;; Results are memoized per `pof`.
(def freqs-files
  (memoize
   (fn [pof]
     (let [words (to-words pof)
           total (count words)]
       ;; Build the map directly from [word ratio] pairs. `occurrences` is
       ;; named so that it does not shadow clojure.core/count.
       (into {}
             (map (fn [[word occurrences]]
                    [word (/ occurrences total)])
                  (frequencies words)))))))
(def freqs (memoize (fn [pof]
(if (instance? java.io.File pof)
(freqs-files pof)
;; Relative frequency of `word` among the words `to-words` extracts from `pof`:
;; occurrences of `word` divided by the total number of words.
;; Returns 0 when `word` does not occur, and also when `pof` yields no words.
;; Results are memoized per [pof word] pair.
(def freq-files
  (memoize
   (fn [pof word]
     ;; Extract the words once so the input is not re-read for the denominator.
     (let [words (to-words pof)
           total (count-m words)]
       (if (zero? total)
         ;; No words at all: report 0 instead of throwing on (/ 0 0).
         0
         (/ (get (frequencies-m words) word 0) total))))))
;; Relative frequency of `word` in `pof`, memoized per [pof word] pair.
;; A java.io.File is delegated to `freq-files`. Anything else is read as a
;; pair whose first and second elements are each scored recursively.
(def freq
  (memoize
   (fn [pof word]
     (if-not (instance? java.io.File pof)
       ;; Combine the two halves by their unweighted mean.
       (let [left  (freq (first pof) word)
             right (freq (second pof) word)]
         (mean [left right]))
       (freq-files pof word)))))
@herdrick
herdrick / Agglomerative hierarchical document clustering in Clojure, refactored to replace hashtables with function calls.
Created July 17, 2010 23:26
Agglomerative hierarchical document clustering in Clojure, refactored to replace hashtables with function calls.
(ns radical (:use [incanter.core :only (abs sq sqrt)]
[incanter.stats :only (mean)]
[clojure.contrib.combinatorics :only (combinations)]))
;; Memoized variants of core functions (the -m suffix marks the memoized form).
;; Each caches its result per argument value, so repeated calls with an equal
;; argument return the cached result instead of recomputing it.
(def set-m (memoize set))
(def sort-m (memoize sort))
(def flatten-m (memoize flatten))
(def frequencies-m (memoize frequencies))
(def count-m (memoize count))
(ns hc (:use [incanter.core :only (abs sq sqrt)]
[incanter.stats :only (mean)]
[clojure.contrib.combinatorics]
[clojure.set]))
(defn to-words
  "Returns the sequence of lowercase a-z words read from file-tree.
  A collection is flattened and the words of every element are concatenated.
  Any other value is converted with str, slurped, lowercased, and scanned
  for runs of the letters a-z."
  [file-tree]
  (if-not (coll? file-tree)
    (let [text    (slurp (str file-tree))
          lowered (org.apache.commons.lang.StringUtils/lowerCase text)]
      (re-seq #"[a-z]+" lowered))
    (mapcat to-words (flatten file-tree))))
(defn merge-general
  "Merges maps m1 and m2. A key present in both maps maps to (f v1 v2);
  a key present in only one of them maps to (g v) of its single value.

  (merge-general {:a 1, :b 2, :c 3} {:a 4, :b 5, :d 6} * sq)
  ;=> {:d 36, :a 4, :b 10, :c 9}"
  [m1 m2 f g]
  (letfn [(lone-entries [src other]
            ;; [k (g v)] pairs for the keys of src that other does not have.
            (for [k (keys src)
                  :when (not (contains? other k))]
              [k (g (src k))]))]
    ;; merge-with covers the shared keys; the two later maps then overwrite
    ;; the one-sided keys with their g-transformed values.
    (merge (merge-with f m1 m2)
           (into {} (lone-entries m1 m2))
           (into {} (lone-entries m2 m1)))))
user> (merge-general {:a 1, :b 2, :c 3} {:a 4, :b 5, :d 6} * sq)
{:d 36, :a 4, :b 10, :c 9}
(ns hc (:use [incanter.core :only (abs sq sqrt)]
[incanter.stats :only (mean)]
[clojure.contrib.combinatorics :only (combinations)]))
(defn make-rfo
  "Builds an rfo: the vector [score relfreqs interesting rfos-or-file], taken
  from the same-named keyword entries of the argument map. Missing keys
  become nil."
  [fields]
  (let [{:keys [score relfreqs interesting rfos-or-file]} fields]
    (vector score relfreqs interesting rfos-or-file)))
;; Positional accessors for the [score relfreqs interesting rfos-or-file]
;; vector produced by make-rfo.
(defn score [rfo] (first rfo))
(defn relfreqs [rfo] (second rfo))
(defn interesting [rfo] (nth rfo 2))
(defn rfos-or-file [rfo] (nth rfo 3))
(ns hc (:use [incanter.core :only (abs sq sqrt)]
[incanter.stats :only (mean)]
[clojure.contrib.combinatorics :only (combinations)]))
;; Count used for "interesting" words (its consumers are outside this section).
(def *interesting-words-count* 3)
;; Root directory of the input documents.
(def *directory-string* "/Users/herdrick/Dropbox/clojure/hierarchical-classifier/data/mixed")
;; Every file under *directory-string*: nil means no extension filter and
;; true means recurse into subdirectories. Evaluated once, at load time.
(def *txt-files*
  (let [root (java.io.File. *directory-string*)]
    (seq (org.apache.commons.io.FileUtils/listFiles root nil true))))
(def file->seq (memoize (fn [file]
(re-seq #"[a-z]+"