;; eschulte/scraper.clj
;;
;; scraper.clj
;; from Nurullah Akkaya's gist at http://gist.github.com/399127
(ns scraper
  (:use [clojure.contrib.duck-streams :only [spit]]
	[clojure.contrib.seq-utils :only [partition-all]])
  (:import (java.net URL)
	   (java.io BufferedReader InputStreamReader FileReader)
           java.util.Date java.text.SimpleDateFormat))

(defn scrape
  "Fetches the resource at `url` (a URL string) and writes its text to a new
  file whose name is a freshly generated random UUID (a relative path, so it
  is resolved against the process working directory).

  The response is read line by line; the lines are re-joined with a newline
  so the saved copy keeps its line structure (line-seq strips the line
  terminators, so plain concatenation would glue adjacent lines together).

  Returns nil on failure: any Exception raised while parsing the URL,
  opening the stream, reading, or writing is deliberately swallowed so that
  one bad URL does not abort a bulk run. On success the result is whatever
  `spit` returns."
  [url]
  (try
   ;; with-open guarantees the network stream is closed even if reading or
   ;; writing throws.
   (with-open [stream (.openStream (URL. url))]
     (let [buf (BufferedReader. (InputStreamReader. stream))]
       (spit (.toString (java.util.UUID/randomUUID))
             (apply str (interpose "\n" (line-seq buf))))))
   ;; Best-effort by design: failures are reported to the caller as nil.
   (catch Exception e nil)))

(comment ;; running under a single pmap
  ;; Scrapes every URL listed (one per line) in "url.list" with a single
  ;; pmap and prints the elapsed wall-clock time in milliseconds.
  (let [start (System/currentTimeMillis)]
    ;; with-open closes the file once the work is done; the deref happens
    ;; inside it so the reader stays open until dorun has consumed every line.
    (with-open [rdr (BufferedReader. (FileReader. "url.list"))]
      (deref (future (dorun (pmap scrape (line-seq rdr))))))
    (println (- (System/currentTimeMillis) start)))
  )

(comment ;; running with two concurrent pmaps
  ;; Splits the URLs listed (one per line) in "url.list" into two halves,
  ;; scrapes each half with its own pmap running in its own future, and
  ;; prints the elapsed wall-clock time in milliseconds.
  (let [start (System/currentTimeMillis)]
    ;; with-open closes the file once both halves are finished.
    (with-open [rdr (BufferedReader. (FileReader. "url.list"))]
      (let [urls (line-seq rdr)
            ;; count realizes the whole seq, so every line is read before
            ;; the futures start; quot keeps the split point an integer.
            half (quot (count urls) 2)
            first-half (future (dorun (pmap scrape (take half urls))))
            second-half (future (dorun (pmap scrape (drop half urls))))]
        (deref first-half) (deref second-half)))
    (println (- (System/currentTimeMillis) start)))
  )
	;; from Nurullah Akkaya's gist at http://gist.github.com/399127
	(ns scraper
	(:use [clojure.contrib.duck-streams :only [spit]]
	[clojure.contrib.seq-utils :only [partition-all]])
	(:import (java.net URL)
	(java.io BufferedReader InputStreamReader FileReader)
	java.util.Date java.text.SimpleDateFormat))

	(defn scrape
	  "Fetches the resource at `url` (a URL string) and writes its text to a new
	  file whose name is a freshly generated random UUID (a relative path, so it
	  is resolved against the process working directory).

	  The response is read line by line; the lines are re-joined with a newline
	  so the saved copy keeps its line structure (line-seq strips the line
	  terminators, so plain concatenation would glue adjacent lines together).

	  Returns nil on failure: any Exception raised while parsing the URL,
	  opening the stream, reading, or writing is deliberately swallowed so that
	  one bad URL does not abort a bulk run. On success the result is whatever
	  `spit` returns."
	  [url]
	  (try
	   ;; with-open guarantees the network stream is closed even if reading or
	   ;; writing throws.
	   (with-open [stream (.openStream (URL. url))]
	     (let [buf (BufferedReader. (InputStreamReader. stream))]
	       (spit (.toString (java.util.UUID/randomUUID))
	             (apply str (interpose "\n" (line-seq buf))))))
	   ;; Best-effort by design: failures are reported to the caller as nil.
	   (catch Exception e nil)))

	(comment ;; running under a single pmap
	  ;; Scrapes every URL listed (one per line) in "url.list" with a single
	  ;; pmap and prints the elapsed wall-clock time in milliseconds.
	  (let [start (System/currentTimeMillis)]
	    ;; with-open closes the file once the work is done; the deref happens
	    ;; inside it so the reader stays open until dorun has consumed every line.
	    (with-open [rdr (BufferedReader. (FileReader. "url.list"))]
	      (deref (future (dorun (pmap scrape (line-seq rdr))))))
	    (println (- (System/currentTimeMillis) start)))
	  )

	(comment ;; running with two concurrent pmaps
	  ;; Splits the URLs listed (one per line) in "url.list" into two halves,
	  ;; scrapes each half with its own pmap running in its own future, and
	  ;; prints the elapsed wall-clock time in milliseconds.
	  (let [start (System/currentTimeMillis)]
	    ;; with-open closes the file once both halves are finished.
	    (with-open [rdr (BufferedReader. (FileReader. "url.list"))]
	      (let [urls (line-seq rdr)
	            ;; count realizes the whole seq, so every line is read before
	            ;; the futures start; quot keeps the split point an integer.
	            half (quot (count urls) 2)
	            first-half (future (dorun (pmap scrape (take half urls))))
	            second-half (future (dorun (pmap scrape (drop half urls))))]
	        (deref first-half) (deref second-half)))
	    (println (- (System/currentTimeMillis) start)))
	  )