Created
March 21, 2020 16:27
-
-
Save jackrusher/31d1ffffe0dcc522add604c6d462145b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Namespace for REPL experiments; pulls in hickory for HTML parsing
;; (`hick`) and CSS-like node selection (`s`).
(ns appliedsciencestudio.covid19-clj-viz.repl
  (:require [clojure.string :as string]
            [hickory.core :as hick]
            [hickory.select :as s]))
;;;; Scraping data
(def worldometers-page
  "We want this data, but it's only published as HTML.

  Holds the page at https://www.worldometers.info/coronavirus/ after it has
  been fetched with `slurp`, parsed with `hick/parse` and converted with
  `hick/as-hickory`. Because this is a top-level `def`, the HTTP request is
  made when the namespace is loaded."
  (-> (slurp "https://www.worldometers.info/coronavirus/")
      hick/parse
      hick/as-hickory))
(defn deepest-content
  "Drill down to the deepest content node.

  `node` may be a map with a `:content` key, a vector of child nodes, a
  string, or nil. While `node` itself -- or, failing that, its first
  element -- has a truthy `:content`, keep descending into that content.
  Once there is nothing left to descend into:

    * a vector yields the concatenation of its string elements
      (non-string elements are dropped);
    * a map yields nil;
    * anything else (a string, nil, ...) is returned unchanged.

  Only the first element of a vector is ever inspected for nested content;
  later siblings are not descended into."
  [node]
  (if-let [content (or (:content node) (:content (first node)))]
    ;; Tail call: `recur` keeps the descent from consuming stack.
    (recur content)
    (cond (vector? node) (apply str (filter string? node))
          (map? node)    nil
          :else          node)))
(def headers
  "Text of each `th` cell in the first `tr` of the first `thead` found in
  `worldometers-page`, extracted with `deepest-content`. A lazy sequence,
  in document order."
  (->> (s/select (s/tag :thead) worldometers-page)
       first
       (s/select (s/tag :tr))
       first
       (s/select (s/tag :th))
       (map deepest-content)))
(def dataset
  "One map per `tr` in the first `tbody` of `worldometers-page`. Each map
  pairs the entries of `headers` with the `deepest-content` of that row's
  `td` cells, position by position. `zipmap` stops at the shorter of the
  two sequences, so surplus headers or cells are dropped."
  (->> (s/select (s/tag :tbody) worldometers-page)
       first
       (s/select (s/tag :tr))
       (map (fn [row]
              (zipmap headers
                      (map deepest-content (s/select (s/tag :td) row)))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment