Skip to content

Instantly share code, notes, and snippets.

@ato
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ato/651a74f7ca60d378ecde to your computer and use it in GitHub Desktop.
Save ato/651a74f7ca60d378ecde to your computer and use it in GitHub Desktop.
;; Not useful as a generic HTTrack conversion tool as we don't bother trying to undo the URL-rewriting. PANDORA
;; crawls are often manually edited and sometimes collected with tools other than HTTrack.
;; Instead we just generate records with the URLs as they deliver in the PANDORA archive:
;; http://pandora.nla.gov.au/pan/...
;;
;; Example output:
;;
;; WARC/1.0
;; WARC-Type: resource
;; WARC-Target-URI: http://pandora.nla.gov.au/pan/85187/20080605-1425/www.tams.act.gov.au/__data/assets/pdf_file/0010/102250/Alcohol_and_Drugs_discussion_paper.pdf
;; WARC-Record-ID: <urn:uuid:0a645b90-52b4-4e31-b986-039a25dd60e4>
;; WARC-Date: 2008-06-05T04:39:04Z
;; WARC-Block-Digest: sha1:2LRPFHP4JIOS37Q3YHK7VG7SMZP2ORE4
;; Content-Type: application/pdf
;; Content-Length: 234249
(ns clj-warc.pandora2warc
"PANDORA display instance to WARC converter."
(:use clojure.java.io)
(:require [clojure.string :as str]
[clojure.contrib.seq-utils :as seq-utils])
(:import (org.archive.util Base32)
(java.nio.file Path Files SimpleFileVisitor FileVisitResult)
(java.nio.file.attribute FileTime BasicFileAttributes))
(:gen-class))
(def ^java.text.SimpleDateFormat ^:dynamic date-fmt nil)
(defn new-date-fmt []
(doto (java.text.SimpleDateFormat. "yyyy-MM-dd'T'HH:mm:ss'Z'")
(.setCalendar (java.util.Calendar/getInstance
(java.util.TimeZone/getTimeZone "UTC")))))
(defn decode-mime-line
"Decode a line of a mime.types file. Returns a seq of [extension mime]
pairs."
[s]
(let [[mime & extensions] (-> s (str/replace #"#.*" "") ; remove comments
(str/split #"\s+"))]
(for [ext extensions] [ext mime])))
(defn load-mimes
"Construct a hash map of file extension to mime-type."
[]
(let [lines (-> (clojure.lang.RT/baseLoader)
(.getResourceAsStream "mime.types")
(reader)
(line-seq))]
(into {} (mapcat decode-mime-line lines))))
(defn read-panaccess
"Read a .panacess-mime.types file and return a map of filenames
to mime-types."
[f]
(let [f (file f)]
(when (.exists f)
(->> (reader f)
(line-seq)
(filter #(re-matches #".*(<Files|ForceType) .*" %))
(partition 2)
(map (fn [[files ft]]
[(second (re-matches #"\s*<Files\s*\"([^\"]+)\">\s*" files))
(second (re-matches #"\s*ForceType\s+(\S*)\s*" ft))]))
(into {})))))
(let [cache (atom nil)]
(defn read-panaccess-cache
"Just like read-panaccess but remembers the last file we read.
Dumb cache for tired brain."
[f]
(let [[fc value] @cache]
(if (= fc f)
value
(second (reset! cache [f (read-panaccess f)]))))))
(def mime-types (load-mimes))
(defn mime-for-file [^java.io.File f]
(let [ext (last (str/split (str f) #"\."))
pamt (file (.getParent f) ".panaccess-mime.types")
pa-mime (get (read-panaccess-cache pamt) (.getName f))]
(if pa-mime
pa-mime
(get mime-types ext "application/octet-stream"))))
(defn symlink? [^java.io.File f]
(let [canon (if (.getParent f)
(file (-> f (.getParentFile) (.getCanonicalFile))
(.getName f))
file)]
(not= (.getCanonicalFile canon) (.getAbsoluteFile canon))))
(defn safe-file-seq
"A tree seq on java.io.Files, doesn't branch on symbolic links."
[dir]
(tree-seq
(fn [^java.io.File f] (and (not (symlink? f))) (.isDirectory f))
(fn [^java.io.File d] (seq (.listFiles d)))
dir))
(defn b32sha1 [^java.security.MessageDigest md]
(str "sha1:" (Base32/encode (.digest md))))
(defn sha1-file [f]
(let [md (java.security.MessageDigest/getInstance "SHA1")
buffer (byte-array (* 1024 1024))]
(with-open [is (java.io.FileInputStream. (file f))]
(while
(let [length (.read is buffer)]
(when (pos? length)
(.update md buffer 0 length)
true))))
(b32sha1 md)))
(defn make-header [f #^java.util.Date cdate prefix-size]
(try
(let [f (file f)
path (subs (str f) prefix-size)
encoded-path (str/replace (java.net.URLEncoder/encode path)
#"%2F" "/")]
(str "WARC/1.0\r\n"
"WARC-Type: resource\r\n"
"WARC-Target-URI: http://pandora.nla.gov.au/pan/" encoded-path "\r\n"
"WARC-Record-ID: <urn:uuid:" (java.util.UUID/randomUUID) ">\r\n"
"WARC-Date: " (.format date-fmt cdate) "\r\n"
"WARC-Block-Digest: " (sha1-file f) "\r\n"
"Content-Type: " (mime-for-file f) "\r\n"
"Content-Length: " (.length f) "\r\n"
"\r\n"))
(catch Exception e
(println "XXX " cdate)
(throw e))))
(defn filetime-to-date [^FileTime ft]
(java.util.Date. (.toMillis ft)))
(defn happy-find
"Walk directory tree returning a list of [file creation-date] pairs."
[dir]
(let [path (.toPath (file dir))]
(seq-utils/fill-queue
(fn [fill]
(java.nio.file.Files/walkFileTree
path (proxy [java.nio.file.SimpleFileVisitor] []
(visitFile [^Path file ^BasicFileAttributes attribs]
(fill [(.toFile file) (filetime-to-date (.creationTime attribs))])
FileVisitResult/CONTINUE)))))))
(defn valid-file? [^java.io.File f]
(println f)
(println (.exists f))
(and (.exists f)
(not (.isDirectory f))
(not (re-matches #"\.panaccess.*" (.getName f)))))
(defn ^java.io.FileOutputStream renaming-output [file]
(let [tmp-file (java.io.File. (str file ".new"))
fos (proxy [java.io.FileOutputStream] [tmp-file]
(close [] (proxy-super close)
(.renameTo tmp-file file)))]
(.deleteOnExit tmp-file)
fos))
(defn write-record [path ctime prefix-size warc]
(copy (make-header path ctime prefix-size) warc)
(with-open [is (java.io.FileInputStream. ^java.io.File path)]
(copy is warc))
(copy "\r\n\r\n" warc))
(defn convert-instance [^java.io.File dest ^java.io.File dir]
(binding [date-fmt (new-date-fmt)]
(let [pi (-> dir (.getParentFile) (.getName))
timestamp (.getName dir)
prefix-size (inc (count (.getParent (.getParentFile dir))))
outfile (file dest (str "nla.arc-" pi "-" timestamp ".warc"))]
(with-open [warc (renaming-output outfile)]
(doseq [[path ctime] (happy-find dir)
:when (valid-file? path)]
(write-record path ctime prefix-size warc))))))
;;(convert-instance (file "/var/tmp") (file "/pandoraworking/working/102582/20101025-0038"))
(defn -main [dest & args]
(dorun (pmap #(convert-instance (file dest) (file %)) args))
(shutdown-agents))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment