Created
February 10, 2011 09:38
-
-
Save j-kan/820201 to your computer and use it in GitHub Desktop.
read and print most frequent words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns com.intangible-industries.cv | |
(:use [clojure.string :only [lower-case split]]) | |
(:use [clojure.contrib.duck-streams :only [read-lines]])) | |
;; Set of common English words that extract-words drops from its output.
;; Built by splitting the literal below on single whitespace characters.
(def stopwords
  (set
    (split "the a an and or of for in on to with by this that these those some other it its we our as but not do does is be are can was were which so from"
           #"\s")))
(defn extract-words
  "Returns a sequence of the lower-cased words found in the file named by
  `filename`.

  Each line obtained from `read-lines` is split on runs of delimiter
  characters (whitespace, a handful of punctuation marks, and Unicode
  'other' category characters). A resulting token is kept only when,
  after lower-casing, it is longer than one character and is not a member
  of `stopwords`."
  [filename]
  (let [delimiters #"[\s\/\.\&\'\,\(\)\;\:\p{C}]+"
        keep?      (fn [token]
                     (and (not (contains? stopwords token))
                          (> (count token) 1)))]
    (->> (read-lines filename)
         ;; one vector of tokens per line, concatenated into a single seq
         (mapcat (fn [line] (split line delimiters)))
         ;; lower-case before the stopword check, since stopwords are lower-case
         (map lower-case)
         (filter keep?))))
(defn sorted-frequencies
  "Counts how often each item occurs in `words` and returns the resulting
  [word count] entries as a sorted sequence: highest count first, with
  entries of equal count ordered by ascending word."
  [words]
  (let [ordering-key (fn [entry]
                       ;; negating the count makes the default ascending
                       ;; comparison yield descending counts; the word then
                       ;; breaks ties in ascending order
                       [(- (val entry)) (key entry)])]
    (sort-by ordering-key (frequencies words))))
(defn print-words
  "Prints, on a single line separated by spaces, the word of every
  [word count] pair in `freqs` whose count is greater than 1, followed by
  a newline. The pairs are printed in the order given. Returns nil."
  [freqs]
  (apply println
         (for [[word n] freqs
               :when (> n 1)]
           word)))
;; Script entry point: read cv.txt, rank its words by frequency, and print
;; those that occur more than once.
(-> "cv.txt"
    extract-words
    sorted-frequencies
    print-words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment