Last active
August 29, 2015 14:03
-
-
Save mopemope/99246e54d019f349f47b to your computer and use it in GitHub Desktop.
Megami Watcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defproject megami.clj "0.1.0-SNAPSHOT" | |
:description "FIXME: write description" | |
:url "http://example.com/FIXME" | |
:license {:name "Eclipse Public License" | |
:url "http://www.eclipse.org/legal/epl-v10.html"} | |
:dependencies [ | |
[org.clojure/clojure "1.6.0"] | |
[clojure-soup "0.0.1"] | |
[org.clojure/core.async "0.1.303.0-886421-alpha"] | |
[com.taoensso/timbre "3.2.1"] | |
[org.clojure/tools.cli "0.2.4"] | |
[clj-http "0.9.2"] | |
] | |
:main megami.watch) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns megami.watch | |
(import | |
(java.util.regex Pattern) | |
(org.jsoup.nodes Element)) | |
(:require | |
[clojure.java.io :as io] | |
[taoensso.timbre :as timbre] | |
[clojure.core.async :as async :refer [<! >! <!! >!! timeout chan alt! alts!! go]] | |
[clj-http.client :as client]) | |
(:use | |
[jsoup.soup] | |
[clojure.tools.cli :only [cli]] | |
)) | |
(timbre/refer-timbre) | |
(def ^:private url-re (Pattern/compile ".*/(\\d+)/.*" Pattern/DOTALL)) | |
(def ^:dynamic *output-path* "./data") | |
(def ^:private threads-url "http://kilauea.bbspink.com/megami/subback.html") | |
(def ^:private encoding "MS932") | |
(def ^:private title-re (Pattern/compile "\\d:\\s+(.*)\\s*\\((\\d+)\\)" Pattern/DOTALL)) | |
(def ^:private wait (* 1000 60 )) | |
(def ^:private cmt-re (Pattern/compile "([\\w-:/.]+\\.jpg)|([\\w:/.]+\\.jpeg)|([\\w:/.]*\\.png)|([\\w:/.]*imepita.jp/[\\w./]+)|([\\w:/.]*imepic.jp/[\\w./]+)|([\\w:/.]*i.imgur.com/[\\w./]+)" Pattern/DOTALL)) | |
(defn write-file [file data] | |
(with-open [w (io/output-stream file)] | |
(.write w data))) | |
(defn download [line url file] | |
(try | |
(let [res (client/get url {:as :byte-array}) | |
status (:status res) | |
^String ct (:Content-Type (:headers res)) | |
^String length (:Content-Length (:headers res)) | |
] | |
(when (and (= status 200) (not (.startsWith ct "text/html")) (not= length "503")) | |
(let [ext (str "." (last (clojure.string/split ct #"/")))] | |
(debug (:headers res)) | |
(infof "start download from %s" url) | |
(infof "download to %s" (str file ext)) | |
(write-file (str file ext) (:body res))))) | |
(catch Exception e | |
(do | |
(errorf "%s:%s" line url) | |
(error e ) | |
)))) | |
(defn- get-dat-url [url] | |
(let [[_ index] (re-find (re-matcher url-re url))] | |
(format "http://kilauea.bbspink.com/megami/dat/%s.dat" index))) | |
(defn- format-url [root ^String href] | |
(str root (subs href 0 (- (.length href) 3)))) | |
(defn- parse-int [s] | |
(Integer. ^String (re-find #"\d+" s))) | |
(defn- get-thread-info [title] | |
(let [[_ title res-count] (re-find (re-matcher title-re title))] | |
{:title title :res-count (parse-int res-count)})) | |
(defn- parse-thread-list [data] | |
(let [base-url (first ($ (parse data) | |
"base" | |
(attr "href")))] | |
(when base-url | |
($ (parse data) | |
"a[href$=50]" | |
(map #(assoc (get-thread-info (.text ^Element %)) | |
:url (format-url base-url (.attr ^Element % "href")))))))) | |
(defn- filename [url] | |
(last (clojure.string/split url #"/"))) | |
(defn- complement-url [^String url] | |
(cond | |
(.startsWith url "http://") url | |
(.startsWith url "ttp://") (str "h" url) | |
:else (str "http://" url))) | |
(defn- search-image [data] | |
(doseq [line (clojure.string/split (:comment data) #"\n") ] | |
(let [[m] (re-find (re-matcher cmt-re line))] | |
(when m | |
(let [url (complement-url (clojure.string/trim m)) | |
filename (filename url)] | |
(download line url (str *output-path* "/" filename)))) | |
)) | |
data) | |
(defn- create-dat-info [thread-info] | |
(let [title (:title thread-info) | |
url (:url thread-info)] | |
(fn [idx line] | |
(let [[hndl mailto date cmt] (clojure.string/split line #"<>") | |
cmt (clojure.string/replace cmt #"<br>" "\n") | |
data { | |
:title title | |
:url url | |
:no (inc idx) | |
:handle hndl | |
:mailto mailto | |
:date date | |
:comment cmt | |
}] | |
(search-image data))))) | |
(defn- parse-thread [thread-info] | |
(let [data (slurp (get-dat-url (:url thread-info)) :encoding encoding) | |
lines (clojure.string/split data #"\r\n|\n")] | |
(vec (map-indexed (create-dat-info thread-info) lines)))) | |
(defn- get-thread-list [] | |
(let [^String data (slurp threads-url :encoding encoding)] | |
(parse-thread-list data))) | |
(defn- watch-megami [] | |
(try | |
(while true | |
(let [ts (shuffle (get-thread-list))] | |
(doseq [thread-info ts] | |
(infof "start %s" thread-info) | |
(let [data (parse-thread thread-info)] | |
(alts!! [(timeout wait)])) | |
(infof "end %s" thread-info))) | |
(alts!! [(timeout wait)])) | |
(catch Exception e | |
(error e))) | |
"OK") | |
(defn start-megami-watch [] | |
(let [c (chan)] | |
(go (>! c (watch-megami))) | |
(let [[v ch] (alts!! [c])] | |
(infof "OK")))) | |
(defn -main [& args] | |
(let [[options args banner] (cli args | |
["-h" "--help" "Show help" :default false :flag true] | |
["-o" "--output-path" "Set download path" :default "./data"] | |
)] | |
(when (:help options) | |
(println banner) | |
(System/exit 0)) | |
(let [path (:output-path options)] | |
(binding [*output-path* path] | |
(start-megami-watch)) | |
))) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment