Skip to content

Instantly share code, notes, and snippets.

@timvisher
Created April 8, 2014 01:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timvisher/10082108 to your computer and use it in GitHub Desktop.
Save timvisher/10082108 to your computer and use it in GitHub Desktop.
(ns sicp-scraper.core
  (:require [clojure.java.io]
            [clojure.string]
            [net.cgrand.enlive-html :as html]))
(def ^:dynamic *html-resources*
  "Lazy sequence of parsed Enlive resources, one per local copy of the pages
  book-Z-H-9.html through book-Z-H-35.html (the upper bound of `range` is
  exclusive), in ascending page order."
  (for [page-number (range 9 36)]
    (html/html-resource
     (clojure.java.io/file
      (str "/Users/tim/Dropbox/sicp/full-text/mitpress.mit.edu/sicp/full-text/book/book-Z-H-"
           page-number
           ".html")))))
(defn node->anchor-title-tuple
  "Turn an anchor node into a `[fragment title]` vector.

  The fragment is computed from the node's :href attribute: take the last
  piece after splitting on `#`, then the last piece of that after splitting
  on `%_toc_`. The title is the node's text as returned by `html/text`.
  Throws if the node has no :href."
  [node]
  (let [href        (get-in node [:attrs :href])
        after-hash  (last (.split href "#"))
        fragment    (last (.split after-hash "%_toc_"))]
    [fragment (html/text node)]))
(def ^:dynamic *chapter-frag->title*
  "Map from anchor fragment to link text, built from the `a` elements of the
  local book-Z-H-4.html whose :href contains `chap` and does not contain
  `Temp`."
  (let [toc-file      (clojure.java.io/file
                       "/Users/tim/Dropbox/sicp/full-text/mitpress.mit.edu/sicp/full-text/book/book-Z-H-4.html")
        toc-resource  (html/html-resource toc-file)
        chapter-links (html/select toc-resource
                                   [[:a
                                     (complement (html/attr-contains :href "Temp"))
                                     (html/attr-contains :href "chap")]])]
    (into {} (map node->anchor-title-tuple chapter-links))))
(def ^:dynamic *exercises-resource*
  "Parsed Enlive resource for the local copy of book-Z-H-37.html, the page
  that `interesting-exercise-nodes` selects exercise links from."
  (-> "/Users/tim/Dropbox/sicp/full-text/mitpress.mit.edu/sicp/full-text/book/book-Z-H-37.html"
      clojure.java.io/file
      html/html-resource))
(defn interesting-exercise-nodes
  "Select from `*exercises-resource*` every `a` element that has an :href
  attribute containing `thm`."
  []
  (let [thm-link [[:a
                   (html/attr? :href)
                   (html/attr-contains :href "thm")]]]
    (html/select *exercises-resource* #{thm-link})))
(def ^:dynamic *exercise-anchor->title*
  "Map from exercise anchor fragment to a title of the form
  `Exercise <link text>`, built from `interesting-exercise-nodes`."
  (into {}
        (for [node (interesting-exercise-nodes)
              :let [[anchor label] (node->anchor-title-tuple node)]]
          [anchor (str "Exercise " label)])))
(def ^:dynamic *master-paged-order-nodes*
  "Lazy sequence with one entry per resource in `*html-resources*`; each
  entry is the result of selecting that page's interesting anchors:

  - `a` elements under an h1-h4 heading whose :href contains `sec` or `chap`
    and does not contain `Temp`;
  - `a` elements whose :name is a key of `*exercise-anchor->title*`;
  - `a` elements matched by `html/has` with a text predicate for `Chapter`.

  NOTE(review): the rest of the file presumably relies on `html/select`
  returning the matches in document order -- confirm against Enlive."
  (map (fn [html-resource]
         (html/select
          html-resource
          #{;; Section and chapter links inside headings.
            [#{:h1 :h2 :h3 :h4}
             [:a
              (html/attr? :href)
              (complement (html/attr-contains :href "Temp"))
              #{(html/attr-contains :href "sec")
                (html/attr-contains :href "chap")}]]
            ;; Named anchors of known exercises. A direct map lookup avoids
            ;; rebuilding a hash-set of all exercise anchors for every
            ;; candidate node; only the truthiness of the result is used.
            [[:a
              (html/attr? :name)
              (html/pred #(contains? *exercise-anchor->title*
                                     (get-in % [:attrs :name])))]]
            ;; Anchors whose content mentions Chapter.
            [[:a
              (html/has [(html/text-pred #(re-seq #"Chapter" %))])]]}))
       *html-resources*))
(def ^:dynamic *section-anchors->headers*
  "Map from anchor fragment to link text for every node in
  `*master-paged-order-nodes*` that carries an :href attribute, across all
  pages."
  (->> *master-paged-order-nodes*
       (mapcat (fn [page-nodes]
                 (filter #(get-in % [:attrs :href]) page-nodes)))
       (map node->anchor-title-tuple)
       (into {})))
(def ^:dynamic *master-order*
  "Vector of anchor fragments for every node in `*master-paged-order-nodes*`,
  flattened across pages in sequence order. Each fragment is the last piece
  of the node's :name (or, when there is no :name, its :href) after
  splitting on `%_toc_`."
  (vec
   (for [page-nodes *master-paged-order-nodes*
         node       page-nodes
         :let [target (or (get-in node [:attrs :name])
                          (get-in node [:attrs :href]))]]
     (last (.split target "%_toc_")))))
(defn bare-anchor-fragment
  "Return the last piece of the anchor node's :href (or, when there is no
  :href, its :name) after splitting on `%_toc_`. Throws if the node has
  neither attribute."
  [anchor-node]
  (let [attrs  (:attrs anchor-node)
        target (or (:href attrs) (:name attrs))]
    (last (.split target "%_toc_"))))
(defn anchor-node->link
  "Build the public mitpress.mit.edu URL for `anchor-node` on page
  `page-number`: the page URL, a `#`, and the node's bare anchor fragment."
  [page-number anchor-node]
  (str "http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-"
       page-number
       ".html"
       "#"
       (bare-anchor-fragment anchor-node)))
(defn link->fragment-link-tuple
  "Return `[fragment link]`, where fragment is the last piece of `link`
  after splitting on `#`."
  [link]
  (let [pieces (.split link "#")]
    [(last pieces) link]))
;; Declared ^:dynamic like the other earmuffed vars in this file; without it
;; the compiler warns that the name suggests a dynamic var.
(def ^:dynamic *fragment->link*
  "Map from anchor fragment to the full public URL of that anchor, covering
  every node in `*master-paged-order-nodes*`."
  (into {}
        (map link->fragment-link-tuple
             (mapcat (fn [order-node-page page-number]
                       (map (partial anchor-node->link page-number)
                            order-node-page))
                     *master-paged-order-nodes*
                     ;; Page numbers start at 9, matching the first page
                     ;; loaded into *html-resources*.
                     (drop 9 (range))))))
(defn title->org-header-marks
  "Return the string of `*` characters used as the org heading prefix for
  `title`. Titles matching `Exercise N.M` always get three stars; any other
  title gets one star plus one per `.` it contains."
  [title]
  (cond
    (re-seq #"Exercise [0-9]+\.[0-9]+" title)
    "***"

    :else
    (let [dot-count (count (re-seq #"\." title))]
      (apply str (repeat (inc dot-count) "*")))))
;; Declared ^:dynamic like the other earmuffed vars in this file; without it
;; the compiler warns that the name suggests a dynamic var.
(def ^:dynamic *org*
  "Lazy sequence of org-mode heading lines, one per fragment in
  `*master-order*`, each of the form `<stars> TODO [[<url>][<title>]]`.

  The title is looked up first in `*chapter-frag->title*`, then in
  `*section-anchors->headers*`, then in `*exercise-anchor->title*`.
  Throws when a fragment is found in none of them."
  (map (fn [fragment]
         (let [raw-title (or (*chapter-frag->title* fragment)
                             (*section-anchors->headers* fragment)
                             (*exercise-anchor->title* fragment))
               ;; Collapse runs of spaces into a single space.
               title     (clojure.string/replace raw-title #" +" " ")]
           (format "%s TODO [[%s][%s]]"
                   (title->org-header-marks title)
                   (*fragment->link* fragment)
                   title)))
       *master-order*))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment