Skip to content

Instantly share code, notes, and snippets.

@sunng87
Last active August 15, 2018 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sunng87/4432937 to your computer and use it in GitHub Desktop.
Save sunng87/4432937 to your computer and use it in GitHub Desktop.
A ClojureScript script that downloads pages from the Disney Wiki. Targets Node.js.
(ns crawler.core
(:require [cljs.nodejs :as node]))
;; Node.js core modules used throughout the crawler.
(def http (node/require "http"))
(def fs (node/require "fs"))

;; Root directory (relative to the working directory) for every output file.
(def local-path-root "out/")

;; Create the output root synchronously so it exists before any later fs call
;; touches it (the previous asynchronous mkdir could still be in flight when
;; the first file was opened). An already-existing directory is expected and
;; ignored; any other failure is reported instead of being silently dropped.
(try
  (.mkdirSync fs local-path-root)
  (catch js/Error e
    (when-not (= "EEXIST" (.-code e))
      (println (str "Cannot create " local-path-root ": " (.-message e))))))
(defn start-req
  "Starts an HTTP GET for url, handing the response object to res-fn.

  When the request emits an error event, the error message is printed and
  the same request is started again. There is no retry limit or delay, so a
  permanently failing URL is retried indefinitely.

  Returns the request object of the first attempt."
  [url res-fn]
  (let [request (.get http url res-fn)
        retry   (fn [err]
                  (println (.-message err))
                  (start-req url res-fn))]
    (.on request "error" retry)
    (.end request)
    request))
;; Forward declarations.
;; *fd*  - dynamic var carrying the file descriptor of the category listing
;;         that is currently being written; rebound around each callback.
;; cb-fn - page-result handler, defined further down.
(declare ^:dynamic *fd* cb-fn)
(defn start-category-req*
  "Requests one page of category members from the wiki API and passes the
  decoded result to finish-cb.

  category  - category name, appended verbatim after Category: in the URL.
  from-page - cmcontinue token of the page to fetch, or nil for the first page.
  finish-cb - function of [data category]; data is the parsed JSON response
              converted with js->clj (string keys).
  fd        - file descriptor bound to *fd* while finish-cb runs.

  Throws from the end handler if the response body is not valid JSON."
  [category from-page finish-cb fd]
  (let [base-url (str "http://disney.wikia.com/api.php?format=json&action=query&list=categorymembers&cmtitle=Category:"
                      category)
        ;; The continuation token comes back from the server and may contain
        ;; characters that are not URL-safe, so it is percent-encoded.
        url (if (nil? from-page)
              base-url
              (str base-url "&cmcontinue=" (js/encodeURIComponent from-page)))]
    (start-req
     url
     (fn [res]
       ;; The accumulator is created per response: start-req may invoke this
       ;; handler again on a retry, and chunks must not leak between attempts.
       (let [chunks (atom [])]
         (.on res "data"
              (fn [chunk]
                (swap! chunks conj chunk)))
         (.on res "end"
              (fn []
                ;; Join the raw buffers first and decode once; decoding each
                ;; chunk separately corrupts multi-byte UTF-8 characters that
                ;; straddle a chunk boundary.
                (let [body     (.concat js/Buffer (clj->js @chunks))
                      all-text (.toString body "utf8")
                      data     (js->clj (.parse js/JSON all-text))]
                  (binding [*fd* fd]
                    (finish-cb data category))))))))))
(defn cb-fn
  "Handles one page of category-member results.

  Appends one line per member (pageid, a tab, title, a newline) to the file
  descriptor currently bound to *fd*. If the response carries a cmcontinue
  token that starts with page, the next page is requested with this same
  function as the handler; otherwise the file descriptor is closed.

  data     - API response as a Clojure map with string keys.
  category - category name being listed."
  [data category]
  (let [fd        *fd* ; capture once; the binding only lives for this call
        pages     (-> data (get "query") (get "categorymembers"))
        next-page (-> data
                      (get "query-continue")
                      (get "categorymembers")
                      (get "cmcontinue"))]
    (doseq [p pages]
      (let [data-line (js/Buffer. (str (get p "pageid")
                                       "\t"
                                       (get p "title")
                                       "\n"))]
        (.writeSync fs fd data-line 0 (.-length data-line))))
    (if-not (or (nil? next-page) (not (zero? (.indexOf next-page "page"))))
      (do
        (println (str "do fetching next page: " next-page))
        (start-category-req* category next-page cb-fn fd))
      ;; Synchronous close, matching the synchronous writes above. The
      ;; callback-less asynchronous close throws on Node versions where the
      ;; callback argument is mandatory.
      (.closeSync fs fd))))
(defn start-category-req
  "Opens <local-path-root><category>.cat for writing and starts listing the
  members of category into it, beginning with the first result page.

  If the file cannot be opened, the error is printed and nothing is
  requested."
  [category]
  (let [path (str local-path-root category ".cat")]
    (.open fs path "w"
           (fn [err fd]
             (if err
               ;; Without a valid descriptor every later write would fail,
               ;; so stop here instead of crawling with an undefined fd.
               (println (str "Cannot open " path ": " (.-message err)))
               (binding [*fd* fd]
                 (start-category-req* category nil cb-fn *fd*)))))))
(defn start-page-req
  "Downloads the rendered page with id page-id into
  <local-path-root><category>/<page-name with every slash removed>.

  category  - category name, used as the output sub-directory.
  page-id   - wiki page id placed in the curid query parameter.
  page-name - page title, used for the file name and for log output.
  dcb       - zero-argument continuation, called once the page has been
              written and the file closed, or after a failed open.

  The target directory is expected to exist already."
  [category page-id page-name dcb]
  (println (str "Downloading " page-name))
  (let [url       (str "http://disney.wikia.com/wiki/?action=render&curid=" page-id)
        ;; A string pattern replaces only the first match; a global regex is
        ;; needed to strip every slash from titles such as A/B/C.
        file-name (.replace page-name (js/RegExp. "/" "g") "")
        path      (str local-path-root category "/" file-name)]
    (.open fs path "w"
           (fn [err local-fd]
             (if err
               (do
                 ;; Report and move on to the next page rather than writing
                 ;; to an undefined descriptor.
                 (println (str "Cannot open " path ": " (.-message err)))
                 (dcb))
               (start-req url
                          (fn [res]
                            (.on res "data"
                                 (fn [chunk]
                                   ;; Synchronous write keeps chunks in order
                                   ;; and guarantees they are flushed before
                                   ;; the close in the end handler.
                                   (.writeSync fs local-fd chunk 0 (.-length chunk))))
                            (.on res "end"
                                 (fn []
                                   (.closeSync fs local-fd)
                                   (println (str "Downloaded " page-name))
                                   (dcb))))))))))
(defn dcb
  "Downloads the pages listed in page-dir one after another.

  page-dir is a sequence of [page-id page-name] entries. The first entry is
  handed to start-page-req together with a continuation that processes the
  remaining entries, so only one download runs at a time. Does nothing when
  page-dir is empty."
  [category page-dir]
  (when (seq page-dir)
    (let [entry     (first page-dir)
          page-id   (nth entry 0 nil)
          page-name (nth entry 1 nil)
          continue  (fn []
                      (dcb category (rest page-dir)))]
      (start-page-req category page-id page-name continue))))
(defn download-category-pages
  "Downloads every page listed in <local-path-root><category>.cat.

  The listing is read synchronously; each non-blank line is split on tabs
  into a [page-id page-name] vector. The per-category output directory
  <local-path-root><category> is created if it does not exist yet, then the
  pages are downloaded sequentially via dcb.

  Throws if the listing cannot be read or the directory cannot be created
  for a reason other than already existing."
  [category]
  (let [category-dir (str local-path-root category)
        listing      (.readFileSync fs (str category-dir ".cat") "UTF-8")
        ;; Skip blank lines (normally just the one after the trailing
        ;; newline) instead of unconditionally dropping the last element.
        page-dir     (->> (into [] (.split listing "\n"))
                          (remove #(= "" %))
                          (map #(into [] (.split % "\t"))))]
    ;; Pages are written below this directory, which nothing else creates.
    (try
      (.mkdirSync fs category-dir)
      (catch js/Error e
        (when-not (= "EEXIST" (.-code e))
          (throw e))))
    (dcb category page-dir)))
(defn starter
  "Program entry point: prints a greeting and downloads the pages of the
  Disney_characters category from its existing listing file."
  []
  (println "hello world")
  ;; Category-listing step, currently disabled. Enabling a line regenerates
  ;; the corresponding .cat listing file.
  (comment
    (start-category-req "Disney_characters")
    (start-category-req "Disney_franchises")
    (start-category-req "Films")
    (start-category-req "Television_series_by_Disney"))
  (let [category "Disney_characters"]
    (download-category-pages category)))

;; Install starter as the command-line main function.
(set! *main-cli-fn* starter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment