Last active
December 21, 2015 18:59
-
-
Save mopemope/6350782 to your computer and use it in GitHub Desktop.
hentai4.me crawler example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns hentai.core | |
(:require | |
[clojure.core.async :as async :refer :all]) | |
(:use | |
[hentai.crawler] | |
[clojure.tools.logging])) | |
(defn- zip
  "Pairs the elements of `a` and `b` positionally and returns a lazy
  sequence of two-element vectors [x y]. The result is as long as the
  shorter of the two inputs.

  `map` is namespace-qualified on purpose: this namespace refers every
  public var of clojure.core.async, and core.async releases that export
  their own `map` would otherwise replace clojure.core/map here."
  [a b]
  (clojure.core/map vector a b))
(defn- download-imgs
  "Downloads every image listed under :img_name in `metadata` concurrently
  and blocks until each download has reported back.

  One channel is created per image; a go block per image calls
  `download-img` and puts the outcome on that image's channel. A download
  that throws is delivered as the caught Throwable instead of being lost,
  so the collecting loop always receives exactly one value per image and
  cannot block forever on a failed download. Failures are logged at error
  level, successes at debug level.

  Returns the :img_name collection of `metadata` unchanged.

  NOTE(review): `download-img` performs blocking HTTP and file I/O inside a
  go block; core.async recommends `thread` for blocking work - consider
  switching if go-pool starvation shows up."
  [metadata]
  (let [names   (:img_name metadata)
        n       (count names)
        ;; alts!! is documented to take a vector of ports.
        cs      (vec (repeatedly n chan))
        ;; A plain fn keeps the try/catch outside the go macro's rewrite.
        attempt (fn [img-url]
                  (try
                    (download-img metadata img-url)
                    (catch Throwable t t)))]
    (doseq [[img-url c] (zip names cs)]
      (go (>! c (attempt img-url))))
    ;; Every channel receives exactly one value, so n takes drain them all.
    (dotimes [_ n]
      (let [[v _] (alts!! cs)]
        (if (instance? Throwable v)
          (error v "image download failed")
          (debug v))))
    names))
(defn- get-hentai-metadata
  "Asynchronously resolves the image metadata for the gallery page `link`
  and puts it on channel `c`.

  The lookup chains `get-gallery-link`, `get-img-id` and `get-img-metadata`.
  Exactly one value is always delivered: if any step throws, or the chain
  yields nil (which cannot be put on a channel), the error is logged and an
  empty map is delivered instead, so a consumer waiting on `c` never hangs.

  Returns the channel of the go block that performs the put."
  [link c]
  (let [;; A plain fn keeps the try/catch outside the go macro's rewrite.
        fetch (fn []
                (try
                  (or (-> (get-gallery-link link)
                          (get-img-id)
                          (get-img-metadata))
                      {})
                  (catch Throwable t
                    (error t (str "metadata lookup failed for " link))
                    {})))]
    (go (>! c (fetch)))))
(defn- crawl-gallery
  "Starts one `get-hentai-metadata` lookup per entry in `links`, each with
  its own channel, then takes one result per link (in whatever order
  alts!! yields them) and hands each to `download-imgs`.

  Returns the string OK once every result has been processed."
  [links]
  (let [pending (mapv (fn [link]
                        (let [c (chan)]
                          (get-hentai-metadata link c)
                          c))
                      links)]
    (dotimes [_ (count pending)]
      (let [[metadata] (alts!! pending)]
        (download-imgs metadata)))
    "OK"))
(defn- crawl-hentai
  "Crawls the listing pages numbered from `start` (inclusive) up to `end`
  (exclusive).

  Each page is fetched in its own go block: the page URL is built from
  *page-url*, slurped, and parsed with `get-hentai-links`. The resulting
  link collections are then passed, one page at a time, to
  `crawl-gallery`. A page whose fetch or parse throws is logged and
  contributes an empty link vector, so the collecting loop always
  receives one value per page and cannot block forever.

  Logs fin at debug level when done and returns that logging call's
  result."
  [start end]
  (let [n     (- end start)
        ;; alts!! is documented to take a vector of ports.
        cs    (vec (repeatedly n chan))
        ;; A plain fn keeps the try/catch outside the go macro's rewrite.
        fetch (fn [page]
                (try
                  (get-hentai-links (slurp (format *page-url* page)))
                  (catch Throwable t
                    (error t (str "failed to fetch listing page " page))
                    [])))]
    (doseq [[idx c] (map-indexed vector cs)]
      (go (>! c (fetch (+ idx start)))))
    (dotimes [_ n]
      (let [[links _] (alts!! cs)]
        (crawl-gallery links)))
    (debug "fin")))
;; Script entry point: evaluated when this namespace is loaded. Crawls
;; listing pages 1 through 9 (crawl-hentai treats its second argument as
;; an exclusive upper bound).
(crawl-hentai 1 10)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns hentai.crawler | |
(:require | |
[clj-http.client :as client] | |
[clojure.data.codec.base64 :as b64] | |
[clojure.data.json :as json] | |
[clojure.java.io :as io]) | |
(:import | |
(java.net URLDecoder) | |
(java.util.regex Pattern) | |
(org.jsoup.nodes Element)) | |
(:use | |
[clojure.tools.logging] | |
[jsoup.soup])) | |
(def ^:dynamic *base-url*
  "Site root. `get-last-page` fetches it to locate the last listing page."
  "http://hentai4.me/")

(def ^:dynamic *page-url*
  "Format template for a listing page; the single %s is the page number."
  "http://hentai4.me/page/%s")

(def ^:dynamic *metadata-url*
  "Format template for the image-metadata endpoint; the two %s slots take
  the image id and the host id, in that order."
  "http://hentai4.me/ajax.php?dowork=getimg&id=%s&host=%s")

(def ^:dynamic *download-path*
  "Root directory under which downloaded images are stored."
  "/tmp/hentai")

;; Captures the two numeric ids embedded in a gallery URL:
;; group 1 is used as the image id, group 2 as the host id.
(def ^:private img-id-re
  (Pattern/compile "/gallery-(\\d+)-(\\d+)" Pattern/DOTALL))

;; Captures the sequence number of an image file name. The dots are
;; escaped so they match a literal dot only, not any character.
(def ^:private img-name-re
  (Pattern/compile "\\d+-hentai4\\.me-(\\d+)\\.jpg" Pattern/DOTALL))
;; | |
;; Utility | |
;; | |
(defn get-img-id
  "Extracts the two numeric ids from a gallery `url` using img-id-re.

  Returns a vector [img-id host-id] of the captured strings; both entries
  are nil when the pattern does not occur in `url`."
  [url]
  (let [groups (re-find img-id-re url)]
    [(nth groups 1 nil) (nth groups 2 nil)]))
(defn- create-img-url
  "Builds the download URL for `img-name` from the :host and :folder_link
  entries of `metadata`."
  [metadata img-name]
  (let [host   (:host metadata)
        folder (:folder_link metadata)]
    (format "http://%s.hdporn4.me/%s/%s" host folder img-name)))
(defn- convert-seq-file
  "Turns a remote image name into a local file name made of the image's
  sequence number, with leading zeros dropped, followed by .jpg.

  The sequence number is the group captured by img-name-re.

  Throws NumberFormatException when `img-name` does not match the
  pattern (now with a message naming the offending value) or when the
  captured digits do not fit in an int."
  [img-name]
  (let [[_ ^String digits] (re-find img-name-re img-name)]
    (when-not digits
      (throw (NumberFormatException.
              (str "unrecognised image name: " (pr-str img-name)))))
    ;; parseInt replaces the Integer(String) constructor, which is
    ;; deprecated for removal in current JDKs.
    (str (Integer/parseInt digits) ".jpg")))
(defn- create-filename
  "Computes the local file for `img-name` and makes sure its parent
  directory exists.

  The file lives under *download-path*, inside a sub-directory named by
  the :folder_link entry of `metadata`, and is named by
  `convert-seq-file`.

  Because :folder_link comes from remotely fetched metadata, the resolved
  location is checked to still be inside *download-path*; an ex-info is
  thrown, before anything is created, if it is not (for example when the
  value contains .. segments).

  Returns the java.io.File to write to."
  [metadata img-name]
  (let [^java.io.File root   (io/file *download-path*)
        ^java.io.File dir    (io/file root (str (:folder_link metadata)))
        ^java.io.File target (io/file dir (convert-seq-file img-name))]
    ;; Path.startsWith compares whole path elements, so /tmp/hentai-x is
    ;; not mistaken for a child of /tmp/hentai.
    (when-not (.startsWith (.toPath (.getCanonicalFile target))
                           (.toPath (.getCanonicalFile root)))
      (throw (ex-info "refusing to write outside the download directory"
                      {:folder_link (:folder_link metadata)
                       :target      (.getPath target)})))
    (.mkdirs dir)
    target))
;; | |
;; parse | |
;; | |
(defn get-hentai-links
  "Parses the HTML text `data` and collects the href attribute of every
  element selected by a[rel=bookmark].

  `parse` and `$` come from jsoup.soup; the hrefs are gathered with mapv."
  [data]
  (let [href-of (fn [^Element anchor] (.attr anchor "href"))]
    ($ (parse data)
       "a[rel=bookmark]"
       (mapv href-of))))
(defn get-gallery-link
  "Fetches the page at `url` and returns the href of the first element
  selected by a[title^=View Gallery], or nil when nothing is selected."
  [url]
  (let [href-of (fn [^Element anchor] (.attr anchor "href"))
        hrefs   ($ (parse (slurp url))
                   "a[title^=View Gallery]"
                   (mapv href-of))]
    (first hrefs)))
(defn get-img-metadata
  "Fetches and decodes the metadata for one gallery.

  Takes the [img-id host-id] pair produced by `get-img-id`, requests the
  URL built from *metadata-url*, then URL-decodes and Base64-decodes the
  response body and parses the result as JSON with keyword keys.

  Byte/text conversions name UTF-8 explicitly rather than relying on the
  platform default charset.
  NOTE(review): assumes the decoded payload is UTF-8 JSON - confirm
  against the service.

  Returns whatever json/read-str yields for the decoded payload."
  [[img-id host-id]]
  (let [raw             (slurp (format *metadata-url* img-id host-id))
        ^String encoded (URLDecoder/decode raw "UTF-8")
        decoded         (b64/decode (.getBytes encoded "UTF-8"))
        json-str        (String. ^bytes decoded "UTF-8")]
    (json/read-str json-str :key-fn keyword)))
(defn get-last-page
  "Fetches *base-url*, takes the href of the first element selected by
  .last, and returns the final /-separated segment of that href parsed as
  an Integer.

  The href is split with String.split rather than clojure.string/split:
  this namespace never requires clojure.string, so the qualified call
  only worked when some other namespace happened to load it first.

  Throws NullPointerException when no element is selected, and
  NumberFormatException when the last segment is not a number."
  []
  (let [href-of      (fn [^Element el] (.attr el "href"))
        ^String link (first ($ (parse (slurp *base-url*))
                               ".last"
                               (mapv href-of)))
        segments     (.split link "/")]
    (Integer/valueOf ^String (last segments))))
;; | |
;; download | |
;; | |
(defn- write-file
  "Writes the byte array `data` to `path` through an output stream that is
  closed when the write finishes or throws. Returns nil."
  [path data]
  (with-open [sink (io/output-stream path)]
    (.write sink ^bytes data)))
(defn- get-img
  "Issues an HTTP GET for `url` with a fixed browser User-Agent header,
  asking clj-http for the body as a byte array, and returns the :body of
  the response."
  [url]
  (let [request  {:headers {"User-Agent" "Mozilla/5.0 (Windows NT 6.1;) Gecko/20100101 Firefox/13.0.1"}
                  :as      :byte-array}
        response (client/get url request)]
    (:body response)))
(defn download-img
  "Downloads the image `img-name` described by `metadata` and stores it on
  disk.

  The source URL comes from `create-img-url` and the destination file from
  `create-filename`. After the bytes are written, the destination is
  logged at debug level.

  Returns the destination file."
  [metadata img-name]
  (let [source  (create-img-url metadata img-name)
        dest    (create-filename metadata img-name)
        payload (get-img source)]
    (write-file dest payload)
    (debug (format "fin:%s" dest))
    dest))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment