Skip to content

Instantly share code, notes, and snippets.

@mopemope mopemope/core.clj
Last active Dec 21, 2015

What would you like to do? crawler example
(ns hentai.core
[clojure.core.async :as async :refer :all])
(defn- zip
  "Pairs the elements of `a` and `b` by position, yielding a lazy sequence of
  two-element vectors. The result is as long as the shorter input."
  [a b]
  (map vector a b))
(defn- download-imgs
  "Downloads every image named under `:img_name` in `metadata`.

  One channel and one go block are created per image name; each go block puts
  the result of `download-img` on its channel. The calling thread then blocks
  until it has taken one value per image, logging each with `debug`.
  Returns nil."
  [metadata]
  (let [names (:img_name metadata)
        n     (count names)
        ;; `alts!!` is documented as taking a vector of ports, so build the
        ;; channels eagerly rather than passing a lazy seq.
        cs    (vec (repeatedly n chan))]
    (doseq [[img-url c] (zip names cs)]
      (go (>! c (download-img metadata img-url))))
    ;; Each channel gets exactly one put, so n takes collect every result.
    (dotimes [_ n]
      (let [[v _] (alts!! cs)]
        (debug v)))))
;; Puts the result of a `->` pipeline that starts at `(get-gallery-link link)`
;; onto channel `c`.
;; NOTE(review): this definition is cut off in this copy of the file. The rest
;; of the pipeline and the closing parens are missing, so it cannot be read
;; as-is; restore it from the original source.
;; NOTE(review): `>!` is only valid inside a `go` block, and none is visible
;; here or at the call site in crawl-gallery. Confirm against the original.
(defn- get-hentai-metadata [link c]
(>! c (-> (get-gallery-link link)
(defn- crawl-gallery
  "Processes every gallery link in `links`.

  One channel is created per link and handed, together with the link, to
  `get-hentai-metadata`. The calling thread then blocks until it has taken one
  value per link and passes each value to `download-imgs`. Returns nil.

  NOTE(review): this relies on `get-hentai-metadata` putting exactly one value
  on the channel it is given; confirm against its full definition."
  [links]
  (let [n  (count links)
        ;; `alts!!` is documented as taking a vector of ports, so build the
        ;; channels eagerly rather than passing a lazy seq.
        cs (vec (repeatedly n chan))]
    (doseq [[link c] (zip links cs)]
      (get-hentai-metadata link c))
    (dotimes [_ n]
      (let [[metadata _] (alts!! cs)]
        (download-imgs metadata)))))
(defn- crawl-hentai
  "Crawls the listing pages with indices from `start` (inclusive) up to `end`
  (exclusive).

  For each page a go block slurps the URL built from `*page-url*`, extracts
  its links with `get-hentai-links` and puts them on that page's channel. The
  calling thread takes one result per page, passes it to `crawl-gallery`, and
  finally logs \"fin\"."
  [start end]
  (let [page-count (- end start)
        chans      (repeatedly page-count chan)]
    (doseq [[offset ch] (map vector (range) chans)]
      (go (>! ch (-> (format *page-url* (+ offset start))
                     slurp
                     get-hentai-links))))
    (dotimes [_ page-count]
      (crawl-gallery (first (alts!! chans))))
    (debug "fin")))
;; Runs the crawl as soon as this file is loaded. crawl-hentai treats `end` as
;; exclusive (it processes end - start pages), so this covers page indices 1-9.
(crawl-hentai 1 10)
(ns hentai.crawler
[clj-http.client :as client]
[ :as b64]
[ :as json]
[ :as io])
( URLDecoder)
(java.util.regex Pattern)
(org.jsoup.nodes Element))
;; URL slurped by get-last-page.
;; NOTE(review): empty in this copy of the file; the real URL must be supplied.
(def ^:dynamic *base-url* "")
;; `format` template for a listing page; crawl-hentai fills in one argument,
;; the page index. NOTE(review): empty in this copy of the file.
(def ^:dynamic *page-url* "")
;; `format` template that get-img-metadata fills in with an image id and a
;; host id. NOTE(review): empty in this copy of the file.
(def ^:dynamic *metadata-url* "")
;; Root directory under which create-filename builds download paths.
(def ^:dynamic *download-path* "/tmp/hentai")
;; Captures the two digit runs of a "/gallery-<digits>-<digits>" URL segment.
(def ^:private img-id-re (Pattern/compile "/gallery-(\\d+)-(\\d+)" Pattern/DOTALL))
;; Captures the run of digits immediately before ".jpg" in an image name.
;; The pattern previously contained a closing paren with no matching opening
;; paren, which makes Pattern/compile throw PatternSyntaxException as soon as
;; the namespace is loaded.
;; NOTE(review): rebuilt from how convert-seq-file uses it (exactly one capture
;; group whose text is parsed as an integer); confirm against real image names.
(def ^:private img-name-re (Pattern/compile "(\\d+)\\.jpg" Pattern/DOTALL))
;; Utility
(defn get-img-id
  "Extracts the two numeric fields matched by `img-id-re` from `url`.

  Returns `[img-id host-id]` as strings, or `[nil nil]` when `url` does not
  match."
  [url]
  (let [groups (re-find img-id-re url)]
    [(nth groups 1 nil) (nth groups 2 nil)]))
(defn- create-img-url
  "Formats an image URL from the `:host` and `:folder_link` entries of
  `metadata` together with `img-name`.

  NOTE(review): the format template is the empty string in this copy of the
  file, so all three arguments are ignored and the result is always empty.
  Restore the real template from the original source."
  [metadata img-name]
  (let [host   (:host metadata)
        folder (:folder_link metadata)]
    (format "" host folder img-name)))
(defn- convert-seq-file
  "Turns an image name into a `<number>.jpg` file name.

  The number is the first capture group of `img-name-re`, parsed as an
  integer, so any leading zeros are dropped. Throws NumberFormatException when
  `img-name` does not match the pattern."
  [img-name]
  (let [match          (re-find img-name-re img-name)
        ^String digits (nth match 1 nil)]
    (-> digits
        Integer/parseInt
        (str ".jpg"))))
(defn- create-filename
  "Builds the destination file for `img-name` and ensures its directory exists.

  The file lives at <*download-path*>/<:folder_link of metadata>/<name>, where
  <name> comes from `convert-seq-file`. Returns the java.io.File."
  [metadata img-name]
  (let [parent (str (:folder_link metadata))
        f      (io/file *download-path* parent (convert-seq-file img-name))]
    ;; Create the gallery directory (and any missing ancestors) up front so the
    ;; later write does not fail on a missing folder.
    (.mkdirs (io/file *download-path* parent))
    ;; download-img passes this result straight to write-file, so the file
    ;; itself must be the return value (previously `f` was bound but never
    ;; returned and the form was left unclosed).
    f))
;; parse
;; Parses `data` and maps the "href" attribute over the elements produced by
;; the `$` query, returning them as a vector (`mapv`).
;; `$` and `parse` are defined outside this view.
;; NOTE(review): unlike get-gallery-link, no selector string appears between
;; `(parse data)` and the `mapv` step. It may have been lost from this copy of
;; the file; confirm against the original.
(defn get-hentai-links [data]
($ (parse data)
(mapv #(.attr ^Element % "href"))))
;; Slurps `url`, parses the response and queries it for anchors whose title
;; starts with "View Gallery", mapping each match to its "href" attribute.
;; NOTE(review): the last line has one more closing paren than this form
;; opens, so the reader will reject it. Either a wrapping form was lost from
;; this copy (get-last-page wraps a similar query in `first`) or the paren is
;; stray. Confirm against the original before fixing.
(defn get-gallery-link [url]
(let [data (slurp url)]
($ (parse data)
"a[title^=View Gallery]"
(mapv #(.attr ^Element % "href"))))))
(defn get-img-metadata
  "Fetches and decodes the metadata for one image.

  Takes an `[img-id host-id]` pair (the shape get-img-id returns), slurps the
  URL built from `*metadata-url*`, URL-decodes the response, Base64-decodes it
  and parses the result as JSON. Returns the parsed JSON with keyword keys."
  [[img-id host-id]]
  (let [data     (slurp (format *metadata-url* img-id host-id))
        ;; Pin both text<->byte conversions to UTF-8 so the result does not
        ;; depend on the JVM's platform default charset.
        ;; NOTE(review): assumes the decoded payload is UTF-8 JSON; confirm.
        encoded  (.getBytes (URLDecoder/decode data "UTF-8") "UTF-8")
        json-str (String. ^bytes (b64/decode encoded) "UTF-8")]
    (json/read-str json-str :key-fn keyword)))
(defn get-last-page
  "Returns, as an Integer, the last '/'-separated segment of the first href
  produced by querying the page at `*base-url*`.

  NOTE(review): no selector string is visible inside the `$` query in this
  copy of the file; confirm against the original."
  []
  (let [hrefs    ($ (parse (slurp *base-url*))
                    (mapv #(.attr ^Element % "href")))
        segments (clojure.string/split (first hrefs) #"/")]
    (Integer. ^String (last segments))))
;; download
(defn- write-file
  "Writes the byte array `data` to `path`. The output stream is opened with
  `io/output-stream` and closed again when the write finishes or throws."
  [path data]
  (with-open [^java.io.OutputStream out (io/output-stream path)]
    (.write out ^bytes data)))
(defn- get-img
  "Requests `url` with a browser-like User-Agent header, asking the HTTP
  client for the body as a byte array, and returns that `:body`."
  [url]
  (let [request-opts {:headers {"User-Agent" "Mozilla/5.0 (Windows NT 6.1;) Gecko/20100101 Firefox/13.0.1"}
                      :as      :byte-array}]
    (-> url
        (client/get request-opts)
        :body)))
(defn download-img
  "Downloads one image and stores it on disk.

  Builds the source URL with `create-img-url` and the destination with
  `create-filename`, writes the fetched bytes there, logs \"fin:<path>\" and
  returns the destination path."
  [metadata img-name]
  (let [url  (create-img-url metadata img-name)
        path (create-filename metadata img-name)]
    (write-file path (get-img url))
    (debug (format "fin:%s" path))
    ;; download-imgs puts this return value on a channel, and core.async
    ;; rejects nil puts, so return the path rather than whatever `debug`
    ;; returns. (The form was also left unclosed before.)
    path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.