Created
August 30, 2015 13:10
-
-
Save CarlSmotricz/0c886cd7701d393ee66b to your computer and use it in GitHub Desktop.
Some Clojure code to calculate frequencies for names of countries and cities in a given text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns countries.core | |
(:use [clojure.java.io :only [reader]] | |
[clojure.string :only [lower-case join]] | |
[clojure.set :only [union]])) | |
; Regexp for a single word of a city or country name.
; A name may consist of several space-separated words; this pattern matches
; one such word. Besides letters, a word may contain:
;   "!"          (appears in some African names),
;   "'" and "`"  (common in transliterated Arabic names),
;   "-"          (French and other hyphenated names).
; Letters are matched with \p{IsAlphabetic} (the Unicode Alphabetic property).
; The POSIX class \p{Alpha} is NOT used because java.util.regex treats it as
; US-ASCII-only unless UNICODE_CHARACTER_CLASS is enabled, which would split
; names such as "São Paulo" in the middle of a word.
(def rex-name #"[-!'`\p{IsAlphabetic}]+")
; Set of lower-cased country names, one per line of resources/countries.txt.
; A name may be multi-word, i.e. contain blanks.
; The file is read inside with-open so the reader is always closed; `set`
; consumes the lazy line-seq completely before the reader goes out of scope.
(def countries
  (with-open [rdr (reader "resources/countries.txt")]
    (->> rdr
         line-seq
         (map lower-case)
         set)))
; Set of lower-cased city names, one per line of resources/city.txt.
; A name may be multi-word, i.e. contain blanks.
; The file is read inside with-open so the reader is always closed; `set`
; consumes the lazy line-seq completely before the reader goes out of scope.
(def cities
  (with-open [rdr (reader "resources/city.txt")]
    (->> rdr
         line-seq
         (map lower-case)
         set)))
; Text to be scanned: the lower-cased words of resources/article.txt, in
; order, where a "word" is any match of rex-name.
; line-seq, mapcat and map are all lazy, so the sequence is forced with doall
; while the reader is still open; with-open then closes the reader.
(def text
  (with-open [rdr (reader "resources/article.txt")]
    (->> rdr
         line-seq
         (mapcat #(re-seq rex-name %))
         (map lower-case)
         doall)))
; Print up to 5 of the "longest" names in names-set, i.e. those made of
; exactly mnp words (as counted by rex-name). mnp is expected to be the
; maximum number of words of any name.
; This is for interest/verification only;
; it's not part of the required functionality.
; doseq is used instead of (map println) because map is lazy: when the
; result is discarded (e.g. bound to _ in a let) nothing would be printed.
; Returns nil.
(defn print-longest-names
  [mnp names-set]
  (doseq [nm (->> names-set
                  (filter #(= (count (re-seq rex-name %)) mnp))
                  (take 5))]
    (println nm)))
; Keep only the entries of freqs whose count is strictly positive and
; return them as a new hash map.
(defn nonzero-freqs
  [freqs]
  (let [kept-pairs (for [[k cnt] freqs
                         :when (pos? cnt)]
                     [k cnt])]
    ; Flatten [[k1 v1] [k2 v2] ...] into k1 v1 k2 v2 ... for hash-map.
    (apply hash-map (mapcat identity kept-pairs))))
; For a single position in the sequence of words, return a new frequency
; map with an incremented count for the name at the beginning of the
; sequence, or the unchanged frequency map if no name starts there.
;
; For each position we try the word combinations of 1, 2, ... words, up to
; the maximum number of word parts of any name in the data,
; e.g. "NEW", "NEW YORK", "NEW YORK CITY". Candidates are tried shortest
; first and the first one found in `names` wins.
;
; Arguments:
;   parts-end - exclusive upper bound on the number of words to combine
;               (i.e. max words per name + 1)
;   names     - set of known (lower-cased) names; used as a predicate
;   freqs     - map of name -> count accumulated so far
;   txt-seq   - the remaining text words, starting at the current position
;
; This function is meant to be used with REDUCE on a frequency map and a
; collection of successive tails of the text sequence (as produced by
; ITERATE with REST). A function used with REDUCE takes only 2 args, so the
; first 2 args are supplied up front with PARTIAL by the caller.
;
; A name missing from freqs is counted from 0 instead of throwing on
; (inc nil), so freqs does not have to be pre-seeded with every name.
(defn tally
  [parts-end names freqs txt-seq]
  (let [candidates (for [n (range 1 parts-end)]
                     (join " " (take n txt-seq)))]
    (if-let [found (some names candidates)]
      (assoc freqs found (inc (get freqs found 0)))
      freqs)))
; Given a set of country names, a set of city names and a sequence of
; text words, find and print the frequencies of each kind of name.
; We calculate frequencies for the combined set of country and city names,
; keep the non-zero ones, and then separate countries and cities for display.
;
; Arguments:
;   country-set - set of lower-cased country names
;   city-set    - set of lower-cased city names
;   text-seq    - sequence of lower-cased words to scan
; Prints the two frequency maps to *out* and returns nil.
;
; Only the arguments are used (never the global countries/cities/text
; vars), so the function can be called with any data.
(defn find-frequencies [country-set city-set text-seq]
  (let [merged-names    (union country-set city-set)
        ; The leading 0 keeps max from failing when both name sets are empty.
        most-name-parts (apply max 0 (map #(count (re-seq rex-name %)) merged-names))
        parts-end       (inc most-name-parts)
        ; Debugging aid: (print-longest-names most-name-parts merged-names)
        ; Every known name starts with a count of 0.
        merged-freqs    (zipmap merged-names (repeat 0))
        ; -----------------------------------------------------------
        ; The heavy lifting is done in this call to REDUCE with TALLY:
        ; tally is applied to every non-empty tail of the word sequence.
        tallied         (reduce (partial tally parts-end merged-names)
                                merged-freqs
                                (take-while seq (iterate rest text-seq)))
        ; -----------------------------------------------------------
        filtered-freqs  (nonzero-freqs tallied)]
    (println (format "Country frequencies:\n%s\n----------\nCity frequencies:\n%s"
                     (select-keys filtered-freqs country-set)
                     (select-keys filtered-freqs city-set)))))
; Script entry point: run the analysis on the data loaded above and
; report the elapsed time.
(time
  (find-frequencies countries cities text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment