Skip to content

Instantly share code, notes, and snippets.

@jmorton
Created January 15, 2017 02:15
Show Gist options
  • Save jmorton/236f740062359c50428bc6bfd0170044 to your computer and use it in GitHub Desktop.
Save jmorton/236f740062359c50428bc6bfd0170044 to your computer and use it in GitHub Desktop.
(ns scrape.core
(:require [clojure.edn :as edn]
[org.httpkit.client :as client]
[net.cgrand.enlive-html :as html]))
(defn get-links
  "Get all links at URL as absolute URLs.

  Fetches and parses the page at `url`, collects the `href` attribute of
  every `<a>` element in document order, and resolves each one against
  `url` using URI reference resolution. Relative hrefs are joined to the
  page URL; absolute and root-relative hrefs are resolved properly instead
  of being appended to `url`. Anchors that have no `href`, and hrefs that
  cannot be parsed as a URI reference, are skipped.

  Returns a lazy seq of strings."
  [url]
  (let [base     (java.net.URI. url)
        ;; Resolve one href against the page URL; nil when it is not a
        ;; parseable URI reference (URI.resolve throws in that case).
        absolute (fn [href]
                   (try
                     (str (.resolve base ^String href))
                     (catch IllegalArgumentException _ nil)))]
    (->> ;; first get all <a href="">...</a> elements...
         (-> (java.net.URL. url)
             (html/html-resource)
             (html/select [:a]))
         ;; ...then pluck out the href attributes (anchors without one are dropped)...
         (map :attrs)
         (keep :href)
         ;; ...and make an absolute URL.
         (keep absolute))))
(defn get-body
  "GET body of URL after delay.

  Sleeps for `delay` milliseconds (250 when omitted), then issues a
  blocking GET for `url` and returns the response body as a string.

  The body is requested with `:as :text` so a string comes back whatever
  Content-Type the server sends; previously the body was passed to `slurp`,
  which only works when the client hands back a stream.

  Throws ex-info (with the client's error as cause) when the request
  itself fails."
  ([url delay]
   (Thread/sleep delay)
   (let [{:keys [body error]} @(client/get url {:as :text})]
     ;; The client reports connection/timeout failures via :error rather than
     ;; throwing; surface them here instead of returning a nil body.
     (when error
       (throw (ex-info (str "GET failed for " url) {:url url} error)))
     body))
  ([url]
   (get-body url 250)))
(defn base->batch
  "Return the batch links found at BASE-URL: every link on that page
  except the first one."
  [base-url]
  (let [links (get-links base-url)]
    ;; the leading link on the page is not a batch, so skip it
    (rest links)))
(defn batch->scenes
  "Return the links found at SCENE-LIST-URL grouped into consecutive pairs.

  The first link on the page is skipped; the remainder is split into
  2-element groups in page order. A trailing unpaired link is dropped."
  [scene-list-url]
  (let [links (rest (get-links scene-list-url))]
    (partition 2 links)))
(defn scene->source
  "Build a 'source' map for a scene/md5 URL pair.

  Takes a 2-element sequence `[checksum-url scene-url]`, downloads the
  content at `checksum-url`, and expects it to hold two whitespace-separated
  tokens: the checksum followed by the scene id.

  Returns `{:id scene-id :url scene-url :checksum checksum}`. When the
  downloaded content does not have that two-token shape, `:id` and
  `:checksum` are nil."
  [[checksum-url scene-url]]
  (let [content (get-body checksum-url)
        ;; Tolerate surrounding whitespace (e.g. a trailing newline) and any
        ;; run of whitespace between the two tokens; input with exactly one
        ;; space and no padding parses exactly as before. `str` keeps a nil
        ;; body from throwing inside re-matches.
        [_ checksum scene-id] (re-matches #"\s*(\S+)\s+(\S+)\s*" (str content))]
    {:id scene-id
     :url scene-url
     :checksum checksum}))
(comment
  ;; REPL walkthrough: evaluate these forms one at a time.
  ;; Page whose links are the batches to scrape.
  (def base-url "https://edclpdsftp.cr.usgs.gov/downloads/lcmap/sites/washington/")
  (def batch-list (base->batch base-url))
  ;; Source maps for the first three scene pairs of the first batch.
  (def source-sample (map scene->source (take 3 (batch->scenes (first batch-list)))))
  ;; pr-str so the file holds readable EDN; spit on the bare lazy seq would
  ;; write its toString (clojure.lang.LazySeq@...) instead of the data.
  (spit "source-sample.edn" (pr-str source-sample)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment