Skip to content

Instantly share code, notes, and snippets.

@j-kan
Created February 10, 2011 09:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j-kan/820201 to your computer and use it in GitHub Desktop.
Save j-kan/820201 to your computer and use it in GitHub Desktop.
read and print most frequent words
(ns com.intangible-industries.cv
(:use [clojure.string :only [lower-case split]])
(:use [clojure.contrib.duck-streams :only [read-lines]]))
;; Set of common English words (articles, prepositions, pronouns, auxiliary
;; verbs, ...) that are ignored when counting word frequencies.
;; Built by splitting one whitespace-separated string, so adding a word only
;; requires appending it to the literal below.
(def stopwords
  (set
    (split "the a an and or of for in on to with by this that these those some other it its we our as but not do does is be are can was were which so from"
           #"\s")))
(defn extract-words
  "Reads the file named by `filename` line by line, splits each line on runs
  of whitespace, punctuation (/ . & ' , ( ) ; :) and Unicode \"other\"
  (\\p{C}) characters, lower-cases every token, and returns a lazy sequence
  of the tokens that are longer than one character and are not in
  `stopwords`."
  [filename]
  (let [keep-word? (fn [token]
                     ;; drop empty/one-letter fragments and stopwords
                     (and (not (contains? stopwords token))
                          (> (count token) 1)))]
    (->> (read-lines filename)
         (mapcat #(split % #"[\s\/\.\&\'\,\(\)\;\:\p{C}]+"))
         (map lower-case)
         (filter keep-word?))))
(defn sorted-frequencies
  "Counts how often each item occurs in `words` and returns the
  [word count] entries sorted by descending count; entries with equal
  counts are ordered ascending by word."
  [words]
  ;; Sort key is [(- count) word]: negating the count turns the natural
  ;; ascending vector comparison into 'highest count first', and the word
  ;; itself breaks ties.
  (sort-by (juxt (comp - val) key)
           (frequencies words)))
(defn print-words
  "Prints, on a single space-separated line, the word of every
  [word count] pair in `freqs` whose count is greater than 1, preserving
  the order of `freqs`. Always ends the line with a newline, even when
  nothing qualifies. Returns nil."
  [freqs]
  (let [repeated (for [[word n] freqs
                       :when (> n 1)]
                   word)]
    (apply print repeated)
    (println)))
;; Script entry point: extract the words from cv.txt, rank them by
;; frequency, and print those that occur more than once.
(-> "cv.txt"
    extract-words
    sorted-frequencies
    print-words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment