Skip to content

Instantly share code, notes, and snippets.

@yhsiang
Created November 12, 2015 13:19
Show Gist options
  • Save yhsiang/9520fc70e4f66fad83dc to your computer and use it in GitHub Desktop.
Save yhsiang/9520fc70e4f66fad83dc to your computer and use it in GitHub Desktop.
Parse all image URLs from http://public.318.io/
(ns pic-parser.core
(:require [clj-http.client :as client]
[net.cgrand.enlive-html :as html]
[clojure.data.json :as json]))
(def target-url
  "Search page URL in grid view mode; a `row` query parameter is appended
  to it to address individual result pages."
  "http://public.318.io/search?viewmode=grid")

(def img-base
  "URL prefix that an image key is appended to (followed by \".jpg\") to
  build a full image URL."
  "http://public.318.io/sites/318_public/files/styles/large/public/digicoll/public/010/")
(defn get-body
  "Issue an HTTP GET for `url` and return the :body of the response."
  [url]
  (-> url
      client/get
      :body))
(defn parse-pagination
  "Return the total number of result pages as an integer.

  Fetches `target-url`, selects the div whose id is thenav, and reads the
  page count from the last child of the last matching node using the
  pattern 共(\\d+)頁.

  Throws ex-info (with the URL and the offending node in its data map) when
  the navigation node or the page-count text cannot be found, rather than
  failing with an opaque error caused by a nil or non-string value."
  []
  (let [nav       (html/select
                   (html/html-snippet (get-body target-url))
                   [[:div (html/attr= :id "thenav")]])
        ;; The page count is expected in the last child of the last match.
        label     (last (:content (last nav)))
        ;; Only run the regex on text nodes; element nodes are maps.
        [_ pages] (when (string? label)
                    (re-find #"共(\d+)頁" label))]
    (when-not pages
      (throw (ex-info "Could not find the total page count on the search page"
                      {:url target-url :label label})))
    ;; parseInt avoids the reflective, deprecated Integer constructor.
    (Integer/parseInt pages)))
(defn gen-numbers
  "Return a lazy sequence of row offsets 0, 20, 40, ... containing as many
  entries as `parse-pagination` reports pages."
  []
  (let [page-count (parse-pagination)
        offsets    (iterate (fn [offset] (+ offset 20)) 0)]
    (take page-count offsets)))
(defn gen-urls
  "Return a lazy sequence of search-page URLs such as
  http://public.318.io/search?viewmode=grid&row=20, one per offset
  produced by `gen-numbers`."
  []
  (for [offset (gen-numbers)]
    (str target-url "&row=" offset)))
(defn parse-urls-in-page
  "Fetch the search page at `url` and return a lazy sequence of item URLs
  such as http://public.318.io/1001, built from the first child of every
  div.image-title found inside a div.featuredimage."
  [url]
  (let [title-nodes (-> url
                        get-body
                        html/html-snippet
                        (html/select [:div.featuredimage :div.image-title]))]
    (for [node title-nodes]
      (str "http://public.318.io/" (first (:content node))))))
(defn get-thumbs
  "Fetch the item page at `url` and return a lazy sequence of image URLs
  such as
  http://public.318.io/sites/318_public/files/styles/large/public/digicoll/public/010/1001_001.jpg.
  Each URL is `img-base` + the :data-key attribute of an li under
  div.icons ul + \".jpg\"."
  [url]
  (let [icon-nodes (-> url
                       get-body
                       html/html-snippet
                       (html/select [:div.icons :ul :li]))]
    (for [node icon-nodes]
      (str img-base (get-in node [:attrs :data-key]) ".jpg"))))
(defn get-all
  "Return a flat lazy sequence of every image URL: for each search page
  from `gen-urls`, for each item URL on that page, every URL returned by
  `get-thumbs`."
  []
  (for [page-url  (gen-urls)
        item-url  (parse-urls-in-page page-url)
        image-url (get-thumbs item-url)]
    image-url))
(defn -main
  "Print every image URL returned by `get-all`, one per line.

  URLs are printed as the lazy sequence is realized, so output appears
  while pages are still being fetched instead of only after the whole
  crawl has finished. Printing line by line also removes the need for
  clojure.string, which this namespace never requires. Command-line
  arguments are accepted and ignored."
  [& _args]
  (doseq [image-url (get-all)]
    (println image-url)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment