Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:14
Show Gist options
  • Save mchampine/1ab8efe3871acfa705f3 to your computer and use it in GitHub Desktop.
Save mchampine/1ab8efe3871acfa705f3 to your computer and use it in GitHub Desktop.
(ns ufo.core
(:use (incanter core stats charts datasets))
(require [ :as json]
[clj-time.format :as tf]
[net.cgrand.enlive-html :as html]
[clojure.string :as string]))
;; URL of the page with one month of data (January 2015).
;; NOTE(review): the string literal below is empty in this copy of the file;
;; it has to be set to the real page address before `udata` can be fetched.
(def ufourl "")
;; Use Enlive to fetch page and parse the HTML into a sequence of nodes.
(defn fetch-url
  "Fetch the page at `url` (a string) and return it parsed by Enlive.

  The string is wrapped in a java.net.URL so that html-resource loads it as a
  remote resource. Throws java.net.MalformedURLException when `url` is not a
  valid URL (for example an empty string)."
  [url]
  (html/html-resource (java.net.URL. url)))
;; Both defs are evaluated when the file is loaded: `udata` calls fetch-url on
;; `ufourl`, and `alt-udata` reads previously saved data from "fetched.dat".
;; NOTE(review): clojure.core/read-string can execute code embedded in its
;; input when *read-eval* is true; only use it on files you produced yourself.
(def udata (fetch-url ufourl)) ; get it from the web
(def alt-udata (read-string (slurp "fetched.dat"))) ; or from a file
;; Record type for one row of the tabular ufo data. The field order matters:
;; ufo-data2 applies ->UData positionally to each 7-element row.
(defrecord UData [date-time city state shape duration summary posted])
;; Use Enlive to select just the relevant html fields
(defn ensel
  "Return the text of every :td found inside a :tr of the parsed page `d`."
  [d]
  (->> (html/select d [:tr :td])
       (map html/text)))
;; Convert node sequence into list of UData records
(defn ufo-data
  "Turn the parsed page `udat` into a seq of maps, one per table row.

  The cell texts returned by ensel are grouped seven at a time and each group
  is zipped with the column keys below."
  [udat]
  (let [field-keys [:date-time :city :state :shape :duration :summary :posted]
        ;; seven cell strings make up one record
        rows (->> udat ensel (partition 7))]
    (for [row rows]
      (zipmap field-keys row))))
;; another way to build the records, using the UData record type
(defn ufo-data2
  "Turn the parsed page `udat` into a seq of UData records, one per table row.

  Cell texts from ensel are grouped seven at a time and passed positionally to
  the ->UData constructor."
  [udat]
  (->> (ensel udat)
       (partition 7)
       (map #(apply ->UData %))))
;; Sightings parsed from `udata`, as a seq of maps keyed by
;; :date-time :city :state :shape :duration :summary :posted.
(def ufos (ufo-data udata)) ; Ok, now we have our data!
(defn ufos-from-url ; convenience function
  "Given a url string `u`, fetch the page and return its ufo entries.

  The entries are built by ufo-data2, so each one is a UData record (which can
  be used like a map with keys such as :date-time and :shape)."
  [u]
  (let [ud (fetch-url u)]
    (ufo-data2 ud)))
;; Analyze "shape" data
;; Count how many entries in `ufos` carry each :shape value.
(frequencies (map :shape ufos)) ; hmm, no shape on 4 entries. Maybe
; they should be labeled unknown as well.
(defn fix-blank-shapes
  "Return the records in `c` with every blank :shape replaced by \"Unknown\".

  A shape counts as blank when it is nil or an empty string; a record that has
  no :shape key at all therefore gains one. Other fields are left untouched."
  [c]
  (letfn [(blank->unk [shape]
            (if (empty? shape) "Unknown" shape))
          (fix-a-blank [r]
            (update-in r [:shape] blank->unk))]
    (map fix-a-blank c)))
;; generalization of fix-blank-shapes
;; could also supply the replacement function but then why not use a one-off?
(defn fix-unknowns
  "Return the records in `c` with a blank `field` value replaced by \"Unknown\".

  Blank means nil or empty; non-blank values and all other fields are kept."
  [c field]
  (let [or-unknown (fn [v] (if (empty? v) "Unknown" v))]
    (for [rec c]
      (update-in rec [field] or-unknown))))
(defn barchart
  "Show an Incanter bar chart for `fq`, a frequencies map or seq of pairs.

  The first item of each entry becomes a category and the second its value;
  `xlab` and `ylab` are the axis labels."
  [xlab ylab fq]
  (let [categories (map first fq)
        counts     (map second fq)
        chart      (bar-chart categories counts
                              :x-label xlab
                              :y-label ylab)]
    (view chart)))
;; Incanter bar chart of UFO Shape Frequency
(->> ufos
     (map :shape)
     frequencies        ; tally the shapes so each entry is a [shape count] pair
     (sort-by second)   ; order the bars by ascending count
     (barchart "UFO Shape" "Frequency"))
;; UFO Sightings by Date
;; need a date extractor
(defn date-extractor
  "Return the day of the month from a date-time string `s`, as a number.

  The string is split into word tokens and the second token is taken as the
  day. Returns nil when there is no second token or it is not all digits.
  NOTE(review): assumes month/day/year order, e.g. \"1/31/15 21:00\" -- confirm
  against the actual data."
  [s]
  (let [day (->> (str s)            ; str makes a nil input behave like \"\"
                 (re-seq #"\w+")
                 second)]
    (when (and day (re-matches #"\d+" day))
      (Long/parseLong day))))
(defn month-extractor
  "Return the month from a date-time string `s`, as a number.

  The string is split into word tokens and the first token is taken as the
  month. Returns nil when there is no token or it is not all digits.
  NOTE(review): assumes month/day/year order, e.g. \"1/31/15 21:00\" -- confirm
  against the actual data."
  [s]
  (let [month (->> (str s)          ; str makes a nil input behave like \"\"
                   (re-seq #"\w+")
                   first)]
    (when (and month (re-matches #"\d+" month))
      (Long/parseLong month))))
;; Incanter bar chart of UFO Sighting Dates
(->> ufos
     (map :date-time)
     (map date-extractor)
     frequencies       ; tally sightings per day -> [day count] pairs
     (sort-by first)   ;; date of the month
     (barchart "Sighting Date" "Frequency"))
;; Frequency of sightings per month
;; Get all events for a year
;; per-month links at
(defn getlinks
  "Given a fetched page of links `d`, return just the href values.

  Selects every :a inside a :td inside a :tr and reads its :href attribute;
  an anchor without an href yields nil."
  [d]
  (let [linkels (html/select d [:tr :td :a])]
    (map #(get-in % [:attrs :href]) linkels)))
(defn urls-for-year
  "Given a year string `ys` of the form yyyy, return a list of all the page
  URLs for that year.

  Fetches the index page (baseurl + \"ndxevent.html\"), collects its link hrefs,
  keeps those whose characters at positions 4-7 equal `ys`, and prefixes each
  with baseurl."
  [ys]
  (let [;; NOTE(review): the base URL literal is empty in this copy of the file;
        ;; it must be set to the site's report directory for the fetch to work.
        baseurl ""
        hrefs (getlinks (fetch-url (str baseurl "ndxevent.html")))
        ;; subs throws on nil or on strings shorter than 8 chars, so guard first
        in-year? (fn [href]
                   (and (string? href)
                        (>= (count href) 8)
                        (= ys (subs href 4 8))))]
    (map #(str baseurl %) (filter in-year? hrefs))))
;; Example: list the page URLs for 2014.
(urls-for-year "2014")
;; Example: keep those URLs in a var, then show the first entry parsed from
;; the first of the pages.
(def ufourls2014 (urls-for-year "2014"))
(first (ufos-from-url (first ufourls2014)))
;; get all the ufo data for a given year
;; Each def concatenates (mapcat) the entries that ufos-from-url returns for
;; every URL produced by urls-for-year.
(def ufoyear2015dat (mapcat ufos-from-url (urls-for-year "2015")))
(def ufoyear2014dat (mapcat ufos-from-url (urls-for-year "2014"))) ;; yikes long running!
;; graph it
(->> ufoyear2014dat
     (map :date-time)
     (map month-extractor)
     frequencies       ; tally sightings per month -> [month count] pairs
     (sort-by first)   ;; month of the year
     (barchart "Sighting Month" "Frequency"))
;; save it: write the printed (pr-str) form of `ufos` to a file
(spit "ndxe201501-ufos.edn" (pr-str ufos))
;; Element Shape
;; {:tag :tr,
;; :attrs {:valign "TOP"},
;; :content
;; ("\n"
;; {:tag :td,
;; :attrs nil,
;; :content
;; ({:tag :font,
;; :attrs
;; {:color "#000000",
;; :face "Calibri",
;; :style "FONT-SIZE:11pt"},
;; :content
;; ({:tag :a,
;; :attrs {:href "ndxe020008.html"},
;; :content ("08/0200")})})}
;; Frequency of Words in Summary
;; tbd
Copy link

I like this example; thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment