Skip to content

Instantly share code, notes, and snippets.

@mopemope
Last active December 21, 2015 18:59
Show Gist options
  • Save mopemope/6350782 to your computer and use it in GitHub Desktop.
Save mopemope/6350782 to your computer and use it in GitHub Desktop.
hentai4.me crawler example
(ns hentai.core
(:require
[clojure.core.async :as async :refer :all])
(:use
[hentai.crawler]
[clojure.tools.logging]))
(defn- zip
  "Pairs the elements of `a` and `b` positionally and returns a lazy sequence
  of two-element vectors `[x y]`. The result ends with the shorter input.

  `clojure.core/map` is spelled out on purpose: this namespace refers all of
  clojure.core.async, which exports its own channel-based `map`, so an
  unqualified `map` can resolve to that function instead of the sequence one."
  [a b]
  (clojure.core/map vector a b))
(defn- download-imgs
  "Downloads every image listed under `:img_name` in `metadata` concurrently
  and blocks until each download has either finished or failed.

  A failed download is logged and skipped; it does not abort the others and
  cannot leave this function waiting forever.

  Returns the value of `:img_name` (the collection of image names)."
  [metadata]
  (let [names (:img_name metadata)
        ;; `async/thread` instead of `go`: downloading is blocking I/O, which
        ;; must not run on the go-block dispatch threads. The channel returned
        ;; by `thread` is closed when the body ends, so a take on it always
        ;; completes, even when the body yields nil after an error.
        ;; `mapv` is eager, so every download is started before we wait.
        pending (mapv (fn [img-name]
                        (async/thread
                          (try
                            (download-img metadata img-name)
                            (catch Exception e
                              (error e (format "download failed:%s" img-name))
                              nil))))
                      names)]
    (doseq [c pending]
      ;; nil means the download failed (already logged above).
      (when-let [path (async/<!! c)]
        (debug path)))
    names))
(defn- get-hentai-metadata
  "Resolves the post at `link` to its image metadata on a background thread
  and puts exactly one value onto channel `c`.

  The value is the metadata map on success. If any step throws, or yields nil,
  the failure is logged and an empty map is delivered instead, so a consumer
  waiting for one value per channel is never left blocked (nil cannot be put
  on a channel).

  Returns the channel created by `async/thread`."
  [link c]
  ;; `async/thread` instead of `go`: the pipeline performs blocking HTTP
  ;; requests, which must not run on the go-block dispatch threads.
  (async/thread
    (async/>!! c
               (or (try
                     (-> (get-gallery-link link)
                         (get-img-id)
                         (get-img-metadata))
                     (catch Exception e
                       (error e (format "metadata failed:%s" link))
                       nil))
                   {}))))
(defn- crawl-gallery
  "Fetches the metadata for every link in `links` concurrently, one channel per
  link, and downloads the images of each gallery as its metadata arrives.

  Blocks until one value has been taken for every link. Returns \"OK\"."
  [links]
  (let [total (count links)
        chans (repeatedly total chan)]
    ;; Start one metadata lookup per link, each reporting on its own channel.
    (doseq [[link ch] (zip links chans)]
      (get-hentai-metadata link ch))
    ;; Take `total` results in whatever order they become available.
    (loop [remaining total]
      (when (pos? remaining)
        (download-imgs (first (alts!! chans)))
        (recur (dec remaining))))
    "OK"))
(defn- crawl-hentai
  "Crawls the listing pages numbered from `start` (inclusive) to `end`
  (exclusive): fetches each page, extracts its post links and hands them to
  `crawl-gallery`.

  Pages are fetched concurrently; their galleries are then processed one page
  at a time, in page order. A page that cannot be fetched or parsed is logged
  and treated as having no links, so one bad page neither stops the crawl nor
  leaves it blocked. Logs \"fin\" when done and returns nil."
  [start end]
  (let [;; `async/thread` instead of `go`: `slurp` is blocking I/O. The channel
        ;; returned by `thread` is closed when the body ends, so the take below
        ;; always completes.
        fetch-links (fn [page]
                      (async/thread
                        (try
                          (get-hentai-links (slurp (format *page-url* page)))
                          (catch Exception e
                            (error e (format "page failed:%s" page))
                            []))))
        ;; `mapv` is eager, so every page fetch is started before we wait.
        pending (mapv fetch-links (range start end))]
    (doseq [c pending]
      ;; A closed channel yields nil; fall back to "no links".
      (crawl-gallery (or (async/<!! c) [])))
    (debug "fin")))
;; Entry point: this top-level form starts the crawl as soon as the namespace
;; is loaded. With these arguments crawl-hentai requests (- 10 1) = 9 listing
;; pages, numbered 1 through 9.
(crawl-hentai 1 10)
(ns hentai.crawler
(:require
[clj-http.client :as client]
[clojure.data.codec.base64 :as b64]
[clojure.data.json :as json]
[clojure.java.io :as io])
(:import
(java.net URLDecoder)
(java.util.regex Pattern)
(org.jsoup.nodes Element))
(:use
[clojure.tools.logging]
[jsoup.soup]))
;; Site root; get-last-page fetches it to find the link to the last listing page.
(def ^:dynamic *base-url* "http://hentai4.me/")
;; Format string for a listing page URL; %s receives the page number.
(def ^:dynamic *page-url* "http://hentai4.me/page/%s")
;; Format string for the image-metadata endpoint; get-img-metadata fills the
;; two %s slots with the image id and the host id, in that order.
(def ^:dynamic *metadata-url* "http://hentai4.me/ajax.php?dowork=getimg&id=%s&host=%s")
;; Root directory under which create-filename places downloaded images.
(def ^:dynamic *download-path* "/tmp/hentai")
;; Captures the two digit groups of a "/gallery-<digits>-<digits>" segment;
;; get-img-id returns them as [img-id host-id].
(def ^:private img-id-re (Pattern/compile "/gallery-(\\d+)-(\\d+)" Pattern/DOTALL))
;; Captures the digit group just before the extension of an image name;
;; convert-seq-file turns it into the local file name.
;; NOTE(review): the dots in "hentai4.me" and ".jpg" are unescaped, so each
;; matches any character — confirm whether that looseness is intended.
(def ^:private img-name-re (Pattern/compile "\\d+-hentai4.me-(\\d+).jpg" Pattern/DOTALL))
;;
;; Utility
;;
(defn get-img-id
  "Extracts the two numeric ids from the first `/gallery-<img-id>-<host-id>`
  segment found in `url`.

  Returns `[img-id host-id]` as strings. Both entries are nil when `url`
  contains no such segment."
  [url]
  (let [groups (re-find img-id-re url)]
    ;; `groups` is [whole-match img-id host-id], or nil when nothing matched;
    ;; the not-found argument keeps `nth` from throwing on nil.
    [(nth groups 1 nil) (nth groups 2 nil)]))
(defn- create-img-url
  "Builds the download URL of image `img-name` from the `:host` and
  `:folder_link` entries of `metadata`."
  [metadata img-name]
  (let [host (get metadata :host)
        folder (get metadata :folder_link)]
    (format "http://%s.hdporn4.me/%s/%s" host folder img-name)))
(defn- convert-seq-file
  "Turns an image name matched by `img-name-re` into the local file name
  \"<n>.jpg\", where n is the captured sequence number parsed as an integer
  (so leading zeros are dropped).

  Throws NumberFormatException when `img-name` does not match the pattern."
  [img-name]
  (let [^String digits (second (re-find img-name-re img-name))]
    (str (Integer/valueOf digits) ".jpg")))
(defn- create-filename
  "Returns the File that image `img-name` should be written to:
  `*download-path*`/<folder_link>/<sequence file name>.

  Creates the target directory (and any missing parents) as a side effect."
  [metadata img-name]
  (let [;; Computed first so that a bad image name fails before any directory
        ;; is created.
        file-name (convert-seq-file img-name)
        dir (io/file *download-path* (str (:folder_link metadata)))]
    (.mkdirs dir)
    (io/file dir file-name)))
;;
;; parse
;;
(defn get-hentai-links
  "Parses the HTML string `data` and returns a vector with the `href`
  attribute of every element selected by `a[rel=bookmark]`."
  [data]
  (let [href-of (fn [^Element anchor] (.attr anchor "href"))]
    ($ (parse data)
       "a[rel=bookmark]"
       (mapv href-of))))
(defn get-gallery-link
  "Fetches the page at `url` and returns the `href` of the first element
  selected by `a[title^=View Gallery]`, or nil when no element is selected."
  [url]
  (let [html (slurp url)
        hrefs ($ (parse html)
                 "a[title^=View Gallery]"
                 (mapv (fn [^Element anchor] (.attr anchor "href"))))]
    (first hrefs)))
(defn get-img-metadata
  "Fetches the metadata for the gallery identified by `[img-id host-id]` from
  `*metadata-url*` and returns it as parsed JSON with keyword keys.

  The response body is URL-decoded, then Base64-decoded, then read as JSON.
  Both byte/string conversions name UTF-8 explicitly instead of relying on the
  JVM's platform default charset, so the result does not vary with the
  machine's locale.
  NOTE(review): assumes the decoded JSON payload is UTF-8 — confirm against a
  real response."
  [[img-id host-id]]
  (let [data (slurp (format *metadata-url* img-id host-id))
        encoded (.getBytes ^String (URLDecoder/decode data "UTF-8") "UTF-8")
        json-str (String. ^bytes (b64/decode encoded) "UTF-8")]
    (json/read-str json-str :key-fn keyword)))
(defn get-last-page
  "Fetches `*base-url*`, takes the `href` of the first element selected by
  `.last`, and returns the final `/`-separated segment of that href parsed as
  an Integer."
  []
  (let [hrefs ($ (parse (slurp *base-url*))
                 ".last"
                 (mapv (fn [^Element el] (.attr el "href"))))
        ;; NOTE(review): clojure.string is used fully qualified without being
        ;; listed in this namespace's :require — confirm it is always loaded.
        ^String page (last (clojure.string/split (first hrefs) #"/"))]
    (Integer/valueOf page)))
;;
;; download
;;
(defn- write-file
  "Writes the byte array `data` to `path`. The output stream is closed when
  the write finishes or throws."
  [path data]
  (with-open [^java.io.OutputStream out (io/output-stream path)]
    (.write out ^bytes data)))
(defn- get-img
  "Issues an HTTP GET for `url` with a browser User-Agent header and returns
  the `:body` of the response, requested as a byte array."
  [url]
  (let [request-opts {:headers {"User-Agent" "Mozilla/5.0 (Windows NT 6.1;) Gecko/20100101 Firefox/13.0.1"}
                      :as :byte-array}
        response (client/get url request-opts)]
    (:body response)))
(defn download-img
  "Downloads image `img-name` of the gallery described by `metadata` and
  stores it at the location chosen by `create-filename`.

  Logs the destination at debug level and returns it."
  [metadata img-name]
  (let [source (create-img-url metadata img-name)
        target (create-filename metadata img-name)
        payload (get-img source)]
    (write-file target payload)
    (debug (format "fin:%s" target))
    target))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment