-
-
Save rightfold/806b816eeb73b7ecc474 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns errorcheck.core | |
(:require [clj-http.client :as client] | |
[clojure.edn :as edn] | |
[clojurewerkz.urly.core :as urly]) | |
(:gen-class)) | |
;; TODO: stop sharing this as a global; pass the cookie store in explicitly.
(def cs
  "Cookie store shared by every HTTP request made in this namespace: it is
  handed to the login POST in `-main` and to each GET in `get-links`."
  (clj-http.cookies/cookie-store))
(defn match?
  "Returns true when `regex` matches anywhere inside `str`, false otherwise."
  [regex str]
  ;; re-find yields nil on no match and a string or vector of groups otherwise;
  ;; it never yields false, so a nil check is all that is needed.
  (some? (re-find regex str)))
(defn find-urls
  "Extracts every quoted `src=` / `href=` attribute value from `body` and
  resolves each one against the URL `from` with `urly/resolve`.

  Returns a set, so a link repeated in the page appears only once. A nil
  `body` (a response that carried no body) yields an empty set instead of
  throwing a NullPointerException from `re-seq`."
  [from body]
  ;; Group 2 captures the opening quote and \2 requires the same closing
  ;; quote; group 3 is the attribute value itself.
  (->> (re-seq #"(src|href)=(\"|')(.+?)\2" (or body ""))
       (map #(urly/resolve from (nth % 3)))
       set))
(defn get-links
  "Fetches `url` with the shared cookie store `cs`, prints the response status
  followed by the URL, and returns a lazy seq of the URLs found in the response
  body for which `predicate` returns truthy.

  The request is made with `:throw-exceptions false`, so the status is printed
  rather than raised for error responses."
  [url predicate]
  (let [{:keys [status body]} (client/get url {:throw-exceptions false
                                               :cookie-store cs})
        candidates (find-urls url body)]
    ;; Report only after the body has been scanned, then filter lazily.
    (println status url)
    (filter predicate candidates)))
(defn crawl
  "Breadth-first crawl. Repeatedly takes the URL at the front of `queue`,
  fetches it via `get-links`, and enqueues every discovered URL that has not
  been seen before and that matches the regex `predicate`.

  `queue`     - collection of URLs to start from.
  `had`       - collection of URLs already seen (normally an empty set).
  `predicate` - regex pattern a URL must match to be followed.

  Returns nil once the queue is exhausted.

  URLs initially present in `queue` are recorded as seen up front, so a page
  linking back to the seed does not cause the seed to be fetched a second
  time. The work list is a PersistentQueue rather than repeated
  `(concat (rest queue) links)`, because nesting one lazy `concat` per
  iteration can overflow the stack on long crawls."
  [queue had predicate]
  (loop [pending (into clojure.lang.PersistentQueue/EMPTY queue)
         seen    (into (set had) queue)]
    (when (seq pending)
      (let [links (get-links (peek pending)
                             #(and (not (contains? seen %))
                                   (match? predicate %)))]
        ;; `into` realises the lazy `links` seq once; the second `into`
        ;; reuses the cached elements.
        (recur (into (pop pending) links)
               (into seen links))))))
(defn -main
  "Command-line entry point.

  `seed`         - URL the crawl starts from.
  `predicate`    - regex source string; only matching URLs are followed.
  `login-url`    - URL that receives the login form POST.
  `login-params` - EDN text read with `edn/read-string` and sent as the
                   form parameters of the login POST.

  Posts the login form using the shared cookie store `cs`, then crawls from
  `seed`."
  [seed predicate login-url login-params]
  (let [credentials (edn/read-string login-params)]
    (client/post login-url {:form-params credentials
                            :cookie-store cs}))
  ;; The pattern is compiled only after the login request, as before.
  (let [url-pattern (java.util.regex.Pattern/compile predicate)]
    (crawl [seed] #{} url-pattern)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment