Page scraper
(ns clj.image-db-clj.ingestor | |
(:use net.cgrand.enlive-html) | |
(:import java.net.URL)) | |
(defn parse_page
  "Retrieve all anchor (<a>) nodes from the page at the specified url.

  url - string address of the page; it is wrapped in a java.net.URL
        before being handed to enlive's html-resource.

  Returns whatever enlive's select yields for the [:a] selector over the
  parsed page."
  [url]
  ;; select takes the selector as its SECOND argument, so the call must be
  ;; parenthesised inside the thread-first form. A bare `select [:a]` would
  ;; expand to ([:a] (select ...)), i.e. a one-argument select followed by
  ;; invoking the vector as a function.
  (-> url URL. html-resource (select [:a])))

;; Kebab-case alias: the caller in this namespace refers to `parse-page`.
(def parse-page parse_page)
(defn folder-filter
  "Predicate: true when the anchor's first content item is a string that
  neither starts with `[` nor ends in a dot followed by 2-4 word
  characters (a file-extension-like suffix).

  anchor-tag - a node map; only its :content entry is read.

  Returns false (instead of throwing) when the first content item is
  missing or is not a string, e.g. an empty anchor or one wrapping a
  nested element."
  [anchor-tag]
  (let [label (-> anchor-tag :content first)]
    ;; Guard first: the regex functions throw on nil and on non-string nodes.
    (and (string? label)
         (nil? (re-find #"^\[|\.\w{2,4}$" label)))))
(defn image-filter
  "Predicate: truthy when the anchor's first content item is a string
  ending in one of the extensions .jpg, .gif, .png, .mng, .jpeg or .bmp
  (lower-case only, as written in the pattern).

  anchor-tag - a node map; only its :content entry is read.

  Returns the re-seq result (a seq of match vectors) on a match, and nil
  when there is no match or when the first content item is missing or is
  not a string."
  [anchor-tag]
  (let [label (-> anchor-tag :content first)]
    ;; Guard first: re-seq throws on nil and on non-string nodes.
    (when (string? label)
      (re-seq #"\.(jpg|gif|png|mng|jpeg|bmp)$" label))))
(defn scrape-and-generate-tags
  "Follow folder-like links starting at url, accumulating their link
  content as tags.

  url    - address of the page to scrape.
  tags   - tags collected so far; returned unchanged once a page has no
           anchors accepted by folder-filter.
  stored - currently unused; the recursive step passes nil.

  On each page only the FIRST anchor accepted by folder-filter is
  followed: its :content is appended to tags and joined onto url with a
  \"/\" to form the next address. Prints the filtered anchors and the
  chosen tag on every step."
  [url tags stored]
  ;; parse_page is the name this namespace actually defines.
  (let [a-tags (filter folder-filter (parse_page url))]
    (println a-tags)
    (if (empty? a-tags)
      tags
      ;; Bind the chosen anchor's content once; it feeds the log line,
      ;; the next url and the tag list.
      (let [tag-content (-> a-tags first :content)]
        (println "Tag: " tag-content)
        (recur (apply str url "/" tag-content)
               (concat tags tag-content)
               nil)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment