Created
April 30, 2009 04:55
-
-
Save talios/104267 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Converts the printable pages of the "history of the internet" wiki into a single PDF.
; Source: http://www.nethistory.co.nz/index.php/Main_Page
(use 'clojure.contrib.str-utils) | |
(import '(java.io ByteArrayInputStream FileOutputStream File) | |
'(javax.xml.parsers DocumentBuilderFactory)) | |
(load-file "clj_web_crawler.clj") | |
; Set the xr.text.aa-fontsize-threshhold system property before anything is rendered.
; NOTE(review): "threshhold" looks misspelled, but it is presumably the exact key
; the renderer reads -- confirm against the renderer's configuration before "fixing" it.
(System/setProperty "xr.text.aa-fontsize-threshhold" "10")
(defn replace-all
  "Applies each [regex replacement] pair in reps to st, one after another,
  and returns the resulting string. reps may be a map or any sequence of
  two-element pairs; st is returned unchanged when reps is empty."
  [st reps]
  (reduce (fn [text [pattern substitute]]
            (.replaceAll text pattern substitute))
          st
          reps))
(defn file-name-for
  "Returns the cache path prefix for page: \"cache/\" followed by the page
  name with every \"/\" replaced by \"_\"."
  [page]
  (let [safe-name (replace-all page {"/" "_"})]
    (str "cache/" safe-name)))
(defn get-cached-or
  "Returns the content for page, backed by an on-disk cache.

  The cache file is (file-name-for page) followed by \"-\" and type. When
  that file exists its contents are read and returned. Otherwise
  get-uncached-content-with is called with page, its result is written to
  the cache file as UTF-8, and the result is returned."
  [page type get-uncached-content-with]
  (let [page-file (str (file-name-for page) "-" type)
        cache-file (File. page-file)]
    (if (.exists cache-file)
      (do
        (println "Reading cached page " page-file)
        (slurp page-file))
      (let [content (get-uncached-content-with page)]
        ; Create the cache directory on first use so the write cannot fail
        ; merely because the directory is missing.
        (when-let [cache-dir (.getParentFile cache-file)]
          (.mkdirs cache-dir))
        ; with-open closes (and therefore flushes) the stream even when the
        ; write throws; previously the stream was never closed.
        (with-open [out (FileOutputStream. cache-file)]
          (.write out (.getBytes content "UTF-8")))
        (println "Saving cached page to " page-file)
        content))))
(defn get-page
  "Requests the printable version of page from www.nethistory.co.nz through
  clj-web-crawler and returns the response with all tabs and newlines removed."
  [page]
  (let [path (str "/index.php/" page "?printable=yes")
        response (clj-web-crawler/crawl-response "http://www.nethistory.co.nz" path)]
    (replace-all response {"(\t|\n)" ""})))
(defn get-cached-page
  "Returns the raw content of page, fetching it with get-page only when no
  cached copy of type \"raw\" exists."
  [page]
  (get-cached-or page "raw" get-page))
(defn get-page-body
  "Returns the part of the cached page that lies between the opening and
  closing body tags. The page is split on everything up to and including the
  opening tag and on everything from the closing tag onward; the second
  piece of that split is the result."
  [page]
  (let [html (get-cached-page page)
        pieces (re-split #"(.*<\s*body[^>]*>)|(<\s*/\s*body\s*\>.+)" html)]
    (second pieces)))
(defn strip-crap
  "Returns the cleaned-up body of page, cached under the \"stripped\" type.

  Cleaning keeps only the body text that precedes the \"<!-- NewPP limit
  report\" marker, appends four closing div tags, and then applies the
  regex replacements below in order."
  [page]
  (let [; NOTE(review): the first pair reads as space -> space in this copy of
        ; the file; it may originally have involved an HTML entity or a
        ; non-breaking space -- confirm against the original source.
        cleanup-rules {" " " "
                       "<h3 id=\"site.*/h3>" ""
                       "<div id=\"jump-to-nav.*-->" ""
                       "<p>www.nethistory.co.nz</p>" ""
                       ; Turn a leading run of capitals in a paragraph into an h3 heading.
                       "<p>([A-Z'\\./\\s\\?!]{4,})([A-Z'\\./\\?!])" "<h3>$1</h3><p>$2"}
        clean (fn [page]
                (let [body (get-page-body page)
                      kept (first (re-split #"<!-- NewPP limit report" body))]
                  (replace-all (str kept "</div></div></div></div>")
                               cleanup-rules)))]
    (get-cached-or page "stripped" clean)))
; Document title; used for both the HTML <title> element and the top-level heading.
(def title "Connecting the Clouds - The Internet in New Zealand")
(defn get-content
  "Returns the stripped HTML of page, preceded by an empty div whose style
  forces a page break before it."
  [page]
  (let [page-break "<div style='page-break-before: always;'></div>"]
    (str page-break (strip-crap page))))
; The complete XHTML document as one string: head with the inlined stylesheet,
; title heading, author line, and every wiki page in reading order.
(def content
  (let [pages ["Preface"
               "Chapter_1_-_The_Tyranny_of_Distance_-_Reaching_out_to_the_World"
               "Chapter_2_-_Battling_with_Big_Iron_-_Unscrambling_the_Code"
               "Chapter_3_-_No.8_Wire_Networks_-_Patchwork_Quilt_of_Protocols"
               "Chapter_4_-_Nuclear_Free_Reforms_-_Nothing_is_What_it_Seems"
               "Chapter_5_-_Selling_the_Family_Jewels_-_Telecom_Holds_Back_the_Tide"
               "Chapter_6_-_Craving_for_Connection_-_Dawn_of_the_Dial-up_Community"
               "Chapter_7_-_Craving_for_Connection_II_-_The_Pioneering_ISPs"
               "Chapter_8_-_The_Rhythm_Method_-_Regulation_by_Litigation"
               "Chapter_9_-_Local_Loop_Languishing_-_Battling_Bandwidth_Blues"
               "Chapter_10_-_Diminished_Capacity_-_Whose_Foot_is_on_the_Hose%3F"
               "Chapter_11_-_Deluge_in_a_Paper_Cup_-_Knowledge_Wave_Wake-up"
               "Chapter_12_-_E-government_Lumbers_Online_-_Presenting_a_Public_Face"
               "Chapter_13_-_Clicks_and_Mortar_-_Beyond_Online_Pamphlets"
               "Chapter_14_-_Battle_of_the_Names_-_Taming_the_Domains"
               "Chapter_15_-_The_Proxy_Revolution_-_Changing_of_the_Guard"
               "Chapter_16_-_Cyberspace_Junk_-_Nailing_Net_Nasties"
               "Chapter_17_-_Bitstream_Boundaries_-_Sorting_out_Speed_Bumps"
               "Chapter_18_-_Download_Culture_-_Infotainment_on-Demand"
               "Chapter_19_-_Mobile_Momentum_-_Weaned_off_the_Wires"
               "Chapter_20_-_Digital_Refresh_Required_-_Government_Learns_to_Share"
               "Chapter_21_-_Broadband_Breakthrough_-_The_Battle_to_Unbundle"
               "Chapter_22_-_In_the_Recovery_Room_-_Remedial_Learning_Required"
               "Chapter_23_-_IP_Channel_Surfing_-_Digital_Vision_Evolving"
               "Chapter_24_-_Leaping_the_Loop_-_Cloud_Cover_Continues"
               "Chapter_25_-_Next_Step_Internet_-_High_Fibre_Diet_Required"
               "Chapter_26_-_20/20_Visionaries_-_Beyond_the_Sevens_Cs"
               "Footnotes"
               "Acronyms_and_Explanations"
               "Internet_in_New_Zealand_Timeline"
               "Postscript_-_Holding_Back_the_Tide_-_Obstacles_to_Advancement"]]
    (str
      "<html><head><title>" title "</title>"
      "<style type='text/css'>"
      (slurp "history.css")
      "</style>"
      "</head><body>"
      ; "<div id='header' style=''>" title "</div>"
      ; "<div id='footer' style=''>Page <span id='pagenumber'/> of <span id='pagecount'/></div>"
      "<h1>" title "</h1>"
      "<p>Written by Keith Newman</p>"
      ; Pages are fetched and appended strictly in the order listed above.
      (apply str (map get-content pages))
      "</body></html>")))
; XML DocumentBuilder obtained from a fresh, default-configured DocumentBuilderFactory.
(def document-builder
  (.. DocumentBuilderFactory (newInstance) (newDocumentBuilder)))
; DOM document produced by parsing the assembled content string, encoded as UTF-8.
(def mydoc
  (let [xhtml-bytes (.getBytes content "UTF-8")
        xhtml-stream (ByteArrayInputStream. xhtml-bytes)]
    (.parse document-builder xhtml-stream)))
; Lay out the parsed document and write it to history.pdf.
; The output file is opened only after layout has succeeded (as before), and
; with-open guarantees the stream is closed even if PDF creation throws.
(let [renderer (doto (org.xhtmlrenderer.pdf.ITextRenderer.)
                 (.setDocument mydoc nil)
                 (.layout))]
  (with-open [pdf-out (FileOutputStream. "history.pdf")]
    (.createPDF renderer pdf-out)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment