Skip to content

Instantly share code, notes, and snippets.

@talios
Created April 30, 2009 04:55
Show Gist options
  • Save talios/104267 to your computer and use it in GitHub Desktop.
Save talios/104267 to your computer and use it in GitHub Desktop.
; Convert the printable pages of the "history of the internet" wiki into a single PDF.
; Source: http://www.nethistory.co.nz/index.php/Main_Page
(use 'clojure.contrib.str-utils)
(import '(java.io ByteArrayInputStream FileOutputStream File)
'(javax.xml.parsers DocumentBuilderFactory))
(load-file "clj_web_crawler.clj")
; Set a JVM-wide system property before any rendering happens.
; Presumably this is read by the org.xhtmlrenderer (Flying Saucer) renderer
; used at the bottom of this script to decide when text is anti-aliased -- TODO confirm.
; NOTE(review): the doubled "h" in "threshhold" looks like a typo, but it is the
; literal key passed at runtime; confirm against the renderer's configuration
; before "correcting" it.
(System/setProperty "xr.text.aa-fontsize-threshhold" "10")
(defn replace-all
  "Bulk regex replacement.
  reps is a collection of [regex replacement] pairs (a map works too, since
  its entries are pairs). Each pair is applied to the running string with
  String.replaceAll, in the iteration order of reps, and the final string is
  returned."
  [st reps]
  (reduce (fn [text [pattern replacement]]
            (.replaceAll text pattern replacement))
          st
          reps))
(defn file-name-for
  "Returns the cache file path for a wiki page name: the name with every \"/\"
  turned into \"_\", placed under the cache/ directory."
  [page]
  (let [flattened (replace-all page {"/" "_"})]
    (str "cache/" flattened)))
(defn get-cached-or
  "Returns the content for page, backed by an on-disk cache.

  page                      - wiki page name; determines the cache file name.
  type                      - suffix appended to the cache file name (after a
                              \"-\") so several variants of a page can be cached.
  get-uncached-content-with - one-argument function called with page to produce
                              the content string when no cache file exists.

  When the cache file exists its contents are read and returned. Otherwise the
  content is produced, written to the cache file as UTF-8, and returned."
  [page type get-uncached-content-with]
  (let [page-file (str (file-name-for page) "-" type)
        cache-file (File. page-file)]
    (if (.exists cache-file)
      (do
        (println "Reading cached page " page-file)
        ; NOTE(review): the cache is written as UTF-8 below, while slurp reads
        ; with its own default encoding -- confirm these agree on this platform.
        (slurp page-file))
      (let [content (get-uncached-content-with page)]
        ; Make sure the cache directory exists; opening the stream would
        ; otherwise fail on a fresh checkout.
        (when-let [dir (.getParentFile cache-file)]
          (.mkdirs dir))
        ; with-open closes (and thereby flushes) the stream even if the write
        ; throws; the stream was previously never closed.
        (with-open [out (FileOutputStream. cache-file)]
          (.write out (.getBytes content "UTF-8")))
        (println "Saving cached page to " page-file)
        content))))
(defn get-page
  "Fetches the printable version of a wiki page from www.nethistory.co.nz via
  clj-web-crawler/crawl-response and returns it with every tab and newline
  character removed."
  [page]
  (let [path (str "/index.php/" page "?printable=yes")
        response (clj-web-crawler/crawl-response "http://www.nethistory.co.nz" path)]
    (replace-all response {"(\t|\n)" ""})))
(defn get-cached-page
  "Returns the raw page text for page, reading it from the \"raw\" cache file
  when present and otherwise fetching it with get-page (which also populates
  the cache)."
  [page]
  (get-cached-or page "raw" get-page))
(defn get-page-body
  "Returns the markup between the opening and closing body tags of the cached
  raw page. The page is split on everything up to and including the opening
  body tag, and on the closing body tag plus whatever follows it; the second
  piece of that split is the body."
  [page]
  (let [raw (get-cached-page page)
        pieces (re-split #"(.*<\s*body[^>]*>)|(<\s*/\s*body\s*\>.+)" raw)]
    (second pieces)))
(defn strip-crap
  "Returns the cleaned-up body markup for page, cached under the \"stripped\"
  suffix.

  Cleaning keeps only the body text before the \"<!-- NewPP limit report\"
  comment, appends four closing div tags, and then applies these regex
  replacements in order:
    1. the &nbsp; entity becomes the numeric entity &#160;
    2. the h3 element whose id starts with \"site\" is removed
    3. the \"jump-to-nav\" div, through a following comment end, is removed
    4. the \"www.nethistory.co.nz\" paragraph is removed
    5. a paragraph opening with a run of upper-case text has that run turned
       into an h3 heading, with the paragraph resuming after it."
  [page]
  (let [; An ordered vector of pairs, so the application order is explicit.
        ; NOTE(review): rules 2 and 3 use a greedy \".*\"; since get-page strips
        ; newlines they can span to the LAST \"/h3>\" / \"-->\" in the text --
        ; confirm that is harmless for every page.
        rules [["&nbsp;" "&#160;"]
               ["<h3 id=\"site.*/h3>" ""]
               ["<div id=\"jump-to-nav.*-->" ""]
               ["<p>www.nethistory.co.nz</p>" ""]
               ["<p>([A-Z'\\./\\s\\?!]{4,})([A-Z'\\./\\?!])" "<h3>$1</h3><p>$2"]]
        clean (fn [page]
                (let [body (get-page-body page)
                      kept (first (re-split #"<!-- NewPP limit report" body))]
                  (replace-all (str kept "</div></div></div></div>") rules)))]
    (get-cached-or page "stripped" clean)))
; Book title; used for both the HTML <title> and the top-level <h1> of the
; generated document.
(def title
  "Connecting the Clouds - The Internet in New Zealand")
(defn get-content
  "Returns the stripped markup for page, preceded by an empty div that forces
  a page break before it."
  [page]
  (let [page-break "<div style='page-break-before: always;'></div>"]
    (str page-break (strip-crap page))))
; The complete XHTML document as one string: a head carrying the title and the
; inlined contents of history.css, then a body with the title heading, the
; author line, and every wiki page (in the order listed) rendered by
; get-content.
(def content
  (let [pages ["Preface"
               "Chapter_1_-_The_Tyranny_of_Distance_-_Reaching_out_to_the_World"
               "Chapter_2_-_Battling_with_Big_Iron_-_Unscrambling_the_Code"
               "Chapter_3_-_No.8_Wire_Networks_-_Patchwork_Quilt_of_Protocols"
               "Chapter_4_-_Nuclear_Free_Reforms_-_Nothing_is_What_it_Seems"
               "Chapter_5_-_Selling_the_Family_Jewels_-_Telecom_Holds_Back_the_Tide"
               "Chapter_6_-_Craving_for_Connection_-_Dawn_of_the_Dial-up_Community"
               "Chapter_7_-_Craving_for_Connection_II_-_The_Pioneering_ISPs"
               "Chapter_8_-_The_Rhythm_Method_-_Regulation_by_Litigation"
               "Chapter_9_-_Local_Loop_Languishing_-_Battling_Bandwidth_Blues"
               "Chapter_10_-_Diminished_Capacity_-_Whose_Foot_is_on_the_Hose%3F"
               "Chapter_11_-_Deluge_in_a_Paper_Cup_-_Knowledge_Wave_Wake-up"
               "Chapter_12_-_E-government_Lumbers_Online_-_Presenting_a_Public_Face"
               "Chapter_13_-_Clicks_and_Mortar_-_Beyond_Online_Pamphlets"
               "Chapter_14_-_Battle_of_the_Names_-_Taming_the_Domains"
               "Chapter_15_-_The_Proxy_Revolution_-_Changing_of_the_Guard"
               "Chapter_16_-_Cyberspace_Junk_-_Nailing_Net_Nasties"
               "Chapter_17_-_Bitstream_Boundaries_-_Sorting_out_Speed_Bumps"
               "Chapter_18_-_Download_Culture_-_Infotainment_on-Demand"
               "Chapter_19_-_Mobile_Momentum_-_Weaned_off_the_Wires"
               "Chapter_20_-_Digital_Refresh_Required_-_Government_Learns_to_Share"
               "Chapter_21_-_Broadband_Breakthrough_-_The_Battle_to_Unbundle"
               "Chapter_22_-_In_the_Recovery_Room_-_Remedial_Learning_Required"
               "Chapter_23_-_IP_Channel_Surfing_-_Digital_Vision_Evolving"
               "Chapter_24_-_Leaping_the_Loop_-_Cloud_Cover_Continues"
               "Chapter_25_-_Next_Step_Internet_-_High_Fibre_Diet_Required"
               "Chapter_26_-_20/20_Visionaries_-_Beyond_the_Sevens_Cs"
               "Footnotes"
               "Acronyms_and_Explanations"
               "Internet_in_New_Zealand_Timeline"
               "Postscript_-_Holding_Back_the_Tide_-_Obstacles_to_Advancement"]]
    (str
      "<html><head><title>" title "</title>"
      "<style type='text/css'>"
      (slurp "history.css")
      "</style>"
      "</head><body>"
      ; "<div id='header' style=''>" title "</div>"
      ; "<div id='footer' style=''>Page <span id='pagenumber'/> of <span id='pagecount'/></div>"
      "<h1>" title "</h1>"
      "<p>Written by Keith Newman</p>"
      ; Every page is fetched/cleaned here, in list order, and concatenated.
      (apply str (map get-content pages))
      "</body></html>")))
; Parse the assembled XHTML string (as UTF-8 bytes) into a DOM document.
(def document-builder (.newDocumentBuilder (DocumentBuilderFactory/newInstance)))
(def mydoc (.parse document-builder (ByteArrayInputStream. (.getBytes content "UTF-8"))))
; Lay the document out and write it to history.pdf. with-open guarantees the
; output stream is closed even if layout or PDF creation throws; the stream was
; previously never closed explicitly.
(with-open [pdf-out (FileOutputStream. "history.pdf")]
  (doto (org.xhtmlrenderer.pdf.ITextRenderer.)
    (.setDocument mydoc nil)
    (.layout)
    (.createPDF pdf-out)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment