CarlSmotricz/countries.clj

## countries.clj
(ns countries.core
  (:use [clojure.java.io :only [reader]]
        [clojure.string :only [lower-case join]]
        [clojure.set :only [union]]))

; Regexp for a word in a city or country name.
; A country name may contain multiple words (space separated).
; African names may contain "!",
; Arabic names have lots of apostrophes (also backquotes),
; French (and other) names have hyphens.
; \p{L} (any Unicode letter) plus \p{M} (combining marks) take care of
; international chars. java.util.regex's \p{Alpha} is US-ASCII only by
; default and would split a name at every accented letter.
(def rex-name #"[-!'`\p{L}\p{M}]+")

; Set of lower-cased country names, one per line of resources/countries.txt.
; May be multi-word, i.e. contain blanks.
; SET consumes every line eagerly, so WITH-OPEN can safely close the
; reader afterwards instead of leaking the file handle.
(def countries
  (with-open [rdr (reader "resources/countries.txt")]
    (set (map lower-case (line-seq rdr)))))

; Set of lower-cased city names, one per line of resources/city.txt.
; May be multi-word, i.e. contain blanks.
; SET consumes every line eagerly, so WITH-OPEN can safely close the
; reader afterwards instead of leaking the file handle.
(def cities
  (with-open [rdr (reader "resources/city.txt")]
    (set (map lower-case (line-seq rdr)))))

; Text to be scanned, as a sequence of lower-cased words as per rex-name.
; DOALL realizes the whole sequence before WITH-OPEN closes the reader;
; without it the lazy LINE-SEQ would be read from a closed stream (and
; without WITH-OPEN the reader would never be closed at all).
(def text
  (with-open [rdr (reader "resources/article.txt")]
    (doall (map lower-case
                (mapcat #(re-seq rex-name %) (line-seq rdr))))))


; Print up to 5 of the "longest" names, i.e. those whose number of
; words (as matched by rex-name) equals mnp.
; This is for interest/verification only;
; it's not part of the required functionality.
; DOSEQ is used instead of a lazy MAP so the names really are printed even
; when the return value is discarded (e.g. bound to _ in a LET).
; Returns nil.
(defn print-longest-names
  [mnp names-set]
  (doseq [nm (take 5 (filter #(= (count (re-seq rex-name %)) mnp) names-set))]
    (println nm)))

; Keep only the entries of freqs whose count is strictly positive and
; rebuild them into a hash map.
(defn nonzero-freqs
  [freqs]
  (->> freqs
       (filter (fn [[_ cnt]] (pos? cnt)))
       (mapcat identity)
       (apply hash-map)))

; For a single position in the sequence of words, return a new frequency
; map with an incremented count for the name at the beginning of the
; sequence, or the unchanged frequency map if no name starts there.
; For each position, we try the word combinations of 1 .. (parts-end - 1)
; leading words, e.g. "NEW" "NEW YORK" "NEW YORK CITY"; SOME returns the
; first (i.e. shortest) candidate that is a member of the set NAMES.
; A name that has no entry in FREQS yet is counted from 0 rather than
; failing on (inc nil).
; This function is meant to be used with REDUCE on a map of frequencies
; and a collection of successive tails of the text sequence, as produced
; by (ITERATE REST TEXT).
; A function used with REDUCE takes only 2 args! So the first 2 args are
; provided by currying (using PARTIAL) in the function calling REDUCE.
(defn tally
  [parts-end names freqs txt-seq]
  ; The local is called HIT so that it does not shadow clojure.core/name.
  (if-let [hit (some names (for [n (range 1 parts-end)] (join " " (take n txt-seq))))]
    (assoc freqs hit (inc (get freqs hit 0)))
    freqs))

; Given a set of country names, a set of city names and a sequence of
; text words, print the non-zero frequencies of each kind of name.
; We calculate frequencies for a combined set of country and city names,
; filter out the zero ones, and then separate countries and cities for display.
; Only the three arguments are used; the function no longer reaches for the
; global TEXT, COUNTRIES and CITIES vars, so it works for any input.
; Returns nil (the result of PRINTLN).
(defn find-frequencies [country-set city-set text-seq]
  (let [merged-names (union country-set city-set)
        ; The leading 0 keeps MAX well-defined when both name sets are empty.
        most-name-parts (apply max 0 (map #(count (re-seq rex-name %)) merged-names))
        parts-end (inc most-name-parts)
        ; _ (print-longest-names most-name-parts merged-names)
        ; Every known name starts with a count of 0.
        merged-freqs (zipmap merged-names (repeat 0))
        ; _ (prn "merged-freqs:" merged-freqs)
        ; -----------------------------------------------------------
        ; The heavy lifting is done in this call to REDUCE with TALLY,
        ; which is applied to every successive tail of the word sequence.
        frequencies (reduce (partial tally parts-end merged-names)
                            merged-freqs
                            (take-while seq (iterate rest text-seq)))
        ; -----------------------------------------------------------
        filtered-freqs (nonzero-freqs frequencies)]
    (println (format "Country frequencies:\n%s\n----------\nCity frequencies:\n%s"
                     (select-keys filtered-freqs country-set)
                     (select-keys filtered-freqs city-set)))))

; Script entry point: run the analysis on the country, city and text data
; defined above; TIME additionally prints the elapsed evaluation time.
(time (find-frequencies countries cities text))
	(ns countries.core
	(:use [clojure.java.io :only [reader]]
	[clojure.string :only [lower-case join]]
	[clojure.set :only [union]]))

	; Regexp for a word in a city or country name.
	; A country name may contain multiple words (space separated).
	; African names may contain "!",
	; Arabic names have lots of apostrophes (also backquotes),
	; French (and other) names have hyphens.
	; \p{L} (any Unicode letter) plus \p{M} (combining marks) take care of
	; international chars. java.util.regex's \p{Alpha} is US-ASCII only by
	; default and would split a name at every accented letter.
	(def rex-name #"[-!'`\p{L}\p{M}]+")

	; Set of lower-cased country names, one per line of resources/countries.txt.
	; May be multi-word, i.e. contain blanks.
	; SET consumes every line eagerly, so WITH-OPEN can safely close the
	; reader afterwards instead of leaking the file handle.
	(def countries
	  (with-open [rdr (reader "resources/countries.txt")]
	    (set (map lower-case (line-seq rdr)))))

	; Set of lower-cased city names, one per line of resources/city.txt.
	; May be multi-word, i.e. contain blanks.
	; SET consumes every line eagerly, so WITH-OPEN can safely close the
	; reader afterwards instead of leaking the file handle.
	(def cities
	  (with-open [rdr (reader "resources/city.txt")]
	    (set (map lower-case (line-seq rdr)))))

	; Text to be scanned, as a sequence of lower-cased words as per rex-name.
	; DOALL realizes the whole sequence before WITH-OPEN closes the reader;
	; without it the lazy LINE-SEQ would be read from a closed stream (and
	; without WITH-OPEN the reader would never be closed at all).
	(def text
	  (with-open [rdr (reader "resources/article.txt")]
	    (doall (map lower-case
	                (mapcat #(re-seq rex-name %) (line-seq rdr))))))


	; Print up to 5 of the "longest" names, i.e. those whose number of
	; words (as matched by rex-name) equals mnp.
	; This is for interest/verification only;
	; it's not part of the required functionality.
	; DOSEQ is used instead of a lazy MAP so the names really are printed even
	; when the return value is discarded (e.g. bound to _ in a LET).
	; Returns nil.
	(defn print-longest-names
	  [mnp names-set]
	  (doseq [nm (take 5 (filter #(= (count (re-seq rex-name %)) mnp) names-set))]
	    (println nm)))

	; Keep only the entries of freqs whose count is strictly positive and
	; rebuild them into a hash map.
	(defn nonzero-freqs
	  [freqs]
	  (->> freqs
	       (filter (fn [[_ cnt]] (pos? cnt)))
	       (mapcat identity)
	       (apply hash-map)))

	; For a single position in the sequence of words, return a new frequency
	; map with an incremented count for the name at the beginning of the
	; sequence, or the unchanged frequency map if no name starts there.
	; For each position, we try the word combinations of 1 .. (parts-end - 1)
	; leading words, e.g. "NEW" "NEW YORK" "NEW YORK CITY"; SOME returns the
	; first (i.e. shortest) candidate that is a member of the set NAMES.
	; A name that has no entry in FREQS yet is counted from 0 rather than
	; failing on (inc nil).
	; This function is meant to be used with REDUCE on a map of frequencies
	; and a collection of successive tails of the text sequence, as produced
	; by (ITERATE REST TEXT).
	; A function used with REDUCE takes only 2 args! So the first 2 args are
	; provided by currying (using PARTIAL) in the function calling REDUCE.
	(defn tally
	  [parts-end names freqs txt-seq]
	  ; The local is called HIT so that it does not shadow clojure.core/name.
	  (if-let [hit (some names (for [n (range 1 parts-end)] (join " " (take n txt-seq))))]
	    (assoc freqs hit (inc (get freqs hit 0)))
	    freqs))

	; Given a set of country names, a set of city names and a sequence of
	; text words, print the non-zero frequencies of each kind of name.
	; We calculate frequencies for a combined set of country and city names,
	; filter out the zero ones, and then separate countries and cities for display.
	; Only the three arguments are used; the function no longer reaches for the
	; global TEXT, COUNTRIES and CITIES vars, so it works for any input.
	; Returns nil (the result of PRINTLN).
	(defn find-frequencies [country-set city-set text-seq]
	  (let [merged-names (union country-set city-set)
	        ; The leading 0 keeps MAX well-defined when both name sets are empty.
	        most-name-parts (apply max 0 (map #(count (re-seq rex-name %)) merged-names))
	        parts-end (inc most-name-parts)
	        ; _ (print-longest-names most-name-parts merged-names)
	        ; Every known name starts with a count of 0.
	        merged-freqs (zipmap merged-names (repeat 0))
	        ; _ (prn "merged-freqs:" merged-freqs)
	        ; -----------------------------------------------------------
	        ; The heavy lifting is done in this call to REDUCE with TALLY,
	        ; which is applied to every successive tail of the word sequence.
	        frequencies (reduce (partial tally parts-end merged-names)
	                            merged-freqs
	                            (take-while seq (iterate rest text-seq)))
	        ; -----------------------------------------------------------
	        filtered-freqs (nonzero-freqs frequencies)]
	    (println (format "Country frequencies:\n%s\n----------\nCity frequencies:\n%s"
	                     (select-keys filtered-freqs country-set)
	                     (select-keys filtered-freqs city-set)))))

	; Script entry point: run the analysis on the country, city and text data
	; defined above; TIME additionally prints the elapsed evaluation time.
	(time (find-frequencies countries cities text))