Skip to content

Instantly share code, notes, and snippets.

@nakkaya
Created May 12, 2010 21:09
Show Gist options
  • Save nakkaya/399127 to your computer and use it in GitHub Desktop.
Save nakkaya/399127 to your computer and use it in GitHub Desktop.
(ns scraper
(:use [clojure.contrib.duck-streams :only [spit]]
[clojure.contrib.seq-utils :only [partition-all]])
(:import (java.net URL)
(java.io BufferedReader InputStreamReader FileReader)))
;; Divisor used by `run` to size the URL batches; each batch is processed
;; in its own future, so this bounds how many futures `run` starts.
(def num-threads 20)
(defn scrape
  "Fetches the document at `url` (a string) and writes its text to a new
  file in the current working directory whose name is a random UUID.

  Best-effort: any Exception raised while connecting, reading, or writing
  is reported on stderr and nil is returned, so one bad URL does not
  abort the caller's batch."
  [url]
  (try
    (with-open [stream (.openStream (URL. url))]
      (let [buf (BufferedReader. (InputStreamReader. stream))]
        (spit (str (java.util.UUID/randomUUID))
              ;; line-seq strips line terminators; put a newline back between
              ;; lines so adjacent lines are not fused in the saved file.
              (apply str (interpose "\n" (line-seq buf))))))
    (catch Exception e
      ;; Report the failure instead of silently discarding it.
      (binding [*out* *err*]
        (println (str "scrape failed for " url ": " e)))
      nil)))
(defn process-urls
  "Calls `scrape` on every URL in `list`, in order, on the calling thread.
  Returns nil."
  [list]
  ;; map is lazy; dorun forces each scrape call for its side effects and
  ;; discards the results.
  (dorun (map scrape list)))
(defn run
  "Reads the file `f` (one URL per line), splits the URLs into at most
  `num-threads` batches, and starts one future per batch that calls
  `process-urls` on it.

  Returns nil immediately; the futures keep running in the background."
  [f]
  (let [urls (with-open [rdr (BufferedReader. (FileReader. f))]
               ;; line-seq is lazy: realize every line before the reader
               ;; is closed, otherwise later reads would fail.
               (doall (line-seq rdr)))
        ;; Ceiling division so the work is spread over up to num-threads
        ;; batches; clamp to 1 because partition-all needs a positive size.
        url-per-thread (max 1 (quot (+ (count urls) (dec num-threads))
                                    num-threads))]
    (doseq [batch (partition-all url-per-thread urls)]
      (future (process-urls batch)))))
;;(run "url.list")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment