Last active
August 29, 2015 14:14
-
-
Save mchampine/1ab8efe3871acfa705f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Namespace for the UFO-sighting scraping/plotting experiments.
;; Note: the `:require` clause must be a keyword form; a bare `(require ...)`
;; inside `ns` is rejected by the ns spec on newer Clojure versions.
(ns ufo.core
  (:use (incanter core stats charts datasets))
  (:require [clojure.data.json :as json]
            [clj-time.format :as tf]
            [net.cgrand.enlive-html :as html]
            [clojure.string :as string]))
;; URL with one month of data (january, 2015) | |
(def ufourl
  "URL of the report index page holding one month of data (January 2015)."
  "http://www.nuforc.org/webreports/ndxe201501.html")
;; Use Enlive to fetch page and parse the HTML into a sequence of nodes. | |
(defn fetch-url
  "Build a java.net.URL from the string `url` and hand it to Enlive's
  `html-resource`, returning whatever parsed representation Enlive produces."
  [url]
  (-> url
      java.net.URL.
      html/html-resource))
;; Parsed page for `ufourl`, fetched from the web when this form is evaluated.
(def udata (fetch-url ufourl))

;; Alternative source: the same kind of data read back from a local file.
;; NOTE(review): `read-string` honours reader eval forms; presumably
;; "fetched.dat" is a trusted local dump -- confirm before pointing this at
;; data from elsewhere.
(def alt-udata
  (-> "fetched.dat"
      slurp
      read-string))
;; Keys for the tabular ufo data | |
;; One row of the tabular UFO data; field order matches the positional
;; constructor `->UData` used by `ufo-data2`.
(defrecord UData
  [date-time
   city
   state
   shape
   duration
   summary
   posted])
;; Use Enlive to select just the relevant html fields | |
(defn ensel
  "Select every :td nested under a :tr in the parsed page `d` and return a
  lazy seq with the text content of each selected cell."
  [d]
  (->> (html/select d [:tr :td])
       (map html/text)))
;; Convert node sequence into list of UData records | |
(defn ufo-data
  "Turn the parsed page `udat` into a lazy seq of plain maps, one per entry.
  The cell texts from `ensel` are grouped seven at a time (a trailing
  incomplete group is dropped by `partition`) and keyed by field name."
  [udat]
  (let [fields [:date-time :city :state :shape :duration :summary :posted]]
    (for [row (partition 7 (ensel udat))]
      (zipmap fields row))))
;; another way to build the records, using the UData record constructor
(defn ufo-data2
  "Like `ufo-data`, but builds a UData record from each group of seven cell
  texts by applying the positional constructor `->UData`."
  [udat]
  (->> (ensel udat)
       (partition 7)
       (map #(apply ->UData %))))
(def ufos
  "Entries (plain maps, via `ufo-data`) parsed from `udata`."
  (ufo-data udata))
(defn ufos-from-url
  "Convenience function: fetch the page at url `u` and return its entries as
  a lazy seq of UData records (built by `ufo-data2`)."
  [u]
  (-> u
      fetch-url
      ufo-data2))
;; Analyze "shape" data | |
;; Count how often each :shape value occurs. The original author observed
;; four entries with no shape and suggested labelling them unknown as well.
(->> ufos
     (map :shape)
     frequencies)
(defn fix-blank-shapes
  "Return a lazy seq of the entries in `c` where every blank :shape value is
  replaced by \"Unknown\".

  A value counts as blank when it is nil, empty, or a string made up only of
  whitespace (scraped cell text may contain stray spaces/newlines). Any other
  value is kept as-is. Entries lacking :shape gain :shape \"Unknown\"."
  [c]
  (letfn [(blank-value? [v]
            ;; `empty?` covers nil and \"\"; the string check adds
            ;; whitespace-only text without affecting other value types.
            (or (empty? v)
                (and (string? v) (clojure.string/blank? v))))
          (blank->unk [shape]
            (if (blank-value? shape) "Unknown" shape))
          (fix-a-blank [r]
            (update-in r [:shape] blank->unk))]
    (map fix-a-blank c)))
;; generalization of fix-blank-shapes | |
;; could also supply the replacement function but then why not use a one-off? | |
(defn fix-unknowns
  "Generalization of `fix-blank-shapes`: return a lazy seq of the entries in
  `c` where a blank value under key `field` is replaced by \"Unknown\".

  A value counts as blank when it is nil, empty, or a string made up only of
  whitespace. Any other value is kept as-is. Entries lacking `field` gain it
  with the value \"Unknown\"."
  [c field]
  (letfn [(blank-value? [v]
            ;; `empty?` covers nil and \"\"; the string check adds
            ;; whitespace-only text without affecting other value types.
            (or (empty? v)
                (and (string? v) (clojure.string/blank? v))))
          (blank->unk [s]
            (if (blank-value? s) "Unknown" s))
          (fix-a-blank [r]
            (update-in r [field] blank->unk))]
    (map fix-a-blank c)))
(defn barchart
  "Build an Incanter bar chart from `fq`, a sequence of [category count]
  pairs (e.g. a sorted frequencies map), labelling the axes with `xlab` and
  `ylab`, and display it with `view`."
  [xlab ylab fq]
  ;; The category/value expressions are passed to `bar-chart` inline rather
  ;; than through locals, keeping the call form the chart macro sees unchanged.
  (let [chart (bar-chart (map first fq)
                         (map second fq)
                         :x-label xlab
                         :y-label ylab)]
    (view chart)))
;; Incanter bar chart of UFO Shape Frequency | |
;; Chart how often each shape was reported, most frequent first.
(let [shape-counts      (frequencies (map :shape (fix-blank-shapes ufos)))
      ;; ascending sort by count, then reversed -> descending
      most-common-first (reverse (sort-by second shape-counts))]
  (barchart "UFO Shape" "Frequency" most-common-first))
;; UFO Sightings by Date | |
;; need a date extractor | |
(defn date-extractor
  "Return, as an integer, the second run of word characters found in `s`.
  Used on :date-time strings to pull out the day of the month
  (presumably these look like \"M/D/YY HH:MM\" -- confirm against the data).

  Throws NumberFormatException when `s` has fewer than two word runs or the
  second one is not a decimal integer."
  [s]
  (let [[_ day] (re-seq #"\w+" s)]
    ;; parseInt resolves statically, unlike the reflective, deprecated
    ;; `(Integer. ...)` constructor call.
    (Integer/parseInt day)))
(defn month-extractor
  "Return, as an integer, the first run of word characters found in `s`.
  Used on :date-time strings to pull out the month
  (presumably these look like \"M/D/YY HH:MM\" -- confirm against the data).

  Throws NumberFormatException when `s` contains no word run or the first
  one is not a decimal integer."
  [s]
  (let [[month] (re-seq #"\w+" s)]
    ;; parseInt resolves statically, unlike the reflective, deprecated
    ;; `(Integer. ...)` constructor call.
    (Integer/parseInt month)))
;; Incanter bar chart of UFO Sighting Dates | |
;; Chart the number of sightings for each day of the month.
(let [days-of-month (map (comp date-extractor :date-time) ufos)
      ;; order the [day count] pairs by day number
      counts-by-day (sort-by first (frequencies days-of-month))]
  (barchart "Sighting Date" "Frequency" counts-by-day))
;; Frequency of sightings per month | |
;; Get all events for a year | |
;; per-month links at http://www.nuforc.org/webreports/ndxevent.html | |
(defn getlinks
  "Given a fetched page of links `d`, select the :a elements nested under
  :tr :td and return a lazy seq of their :href attribute values (nil for an
  anchor that has no :href)."
  [d]
  (for [anchor (html/select d [:tr :td :a])]
    (get-in anchor [:attrs :href])))
(defn urls-for-year
  "Given a year string `ys` of the form \"yyyy\", fetch the event index page
  (ndxevent.html) and return a lazy seq of absolute URLs for every link whose
  href carries that year in characters 4-8 (hrefs look like
  \"ndxe201501.html\").

  Links without an href, or with an href too short to contain a year, are
  skipped instead of raising an exception."
  [ys]
  (let [baseurl   "http://www.nuforc.org/webreports/"
        hrefs     (getlinks (fetch-url (str baseurl "ndxevent.html")))
        for-year? (fn [href]
                    ;; guard `subs` against nil and short hrefs
                    (and (string? href)
                         (>= (count href) 8)
                         (= ys (subs href 4 8))))]
    (map #(str baseurl %) (filter for-year? hrefs))))
;; Example: evaluate to see the report URLs for 2014.
(urls-for-year "2014")

;; Keep the 2014 URLs around, then look at the first entry of the first page.
(def ufourls2014 (urls-for-year "2014"))
(-> ufourls2014
    first
    ufos-from-url
    first)
;; get all the ufo data for a given year | |
;; All entries for a given year: every page for the year, concatenated.
(def ufoyear2015dat
  (->> (urls-for-year "2015")
       (mapcat ufos-from-url)))

;; Yikes, long running! (one page fetch per URL once the seq is realized)
(def ufoyear2014dat
  (->> (urls-for-year "2014")
       (mapcat ufos-from-url)))
;; graph it | |
;; Chart the number of 2014 sightings for each month.
(let [months          (map (comp month-extractor :date-time) ufoyear2014dat)
      ;; order the [month count] pairs by month number
      counts-by-month (sort-by first (frequencies months))]
  (barchart "Sighting Month" "Frequency" counts-by-month))
;; save it | |
;; Write the printed representation of `ufos` to an EDN file.
(->> ufos
     pr-str
     (spit "ndxe201501-ufos.edn"))
;; Element Shape | |
;; {:tag :tr, | |
;; :attrs {:valign "TOP"}, | |
;; :content | |
;; ("\n" | |
;; {:tag :td, | |
;; :attrs nil, | |
;; :content | |
;; ({:tag :font, | |
;; :attrs | |
;; {:color "#000000", | |
;; :face "Calibri", | |
;; :style "FONT-SIZE:11pt"}, | |
;; :content | |
;; ({:tag :a, | |
;; :attrs {:href "ndxe020008.html"}, | |
;; :content ("08/0200")})})} | |
;; Frequency of Words in Summary
;; tbd | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I like this example; thank you.