Created
May 12, 2010 21:09
-
-
Save nakkaya/399127 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns scraper | |
(:use [clojure.contrib.duck-streams :only [spit]] | |
[clojure.contrib.seq-utils :only [partition-all]]) | |
(:import (java.net URL) | |
(java.io BufferedReader InputStreamReader FileReader))) | |
;; Target number of concurrent worker futures; `run` divides the URL list
;; by this value to decide how many URLs each future receives.
(def num-threads 20)
(defn scrape
  "Downloads the resource at `url` (a string) and saves its text to a new
  file in the current working directory whose name is a random UUID.

  Lines are re-joined with \"\\n\" so the saved text keeps its line
  structure (`line-seq` strips line terminators). The response is decoded
  with the platform default charset because no charset is passed to
  InputStreamReader.

  Best-effort: any Exception (malformed URL, I/O failure, ...) is reported
  on *err* and the function returns nil instead of throwing."
  [url]
  (try
    ;; Both the raw stream and the reader are closed by with-open,
    ;; even when reading or writing fails part-way through.
    (with-open [stream (.openStream (URL. url))
                buf (BufferedReader. (InputStreamReader. stream))]
      (spit (.toString (java.util.UUID/randomUUID))
            (apply str (interpose "\n" (line-seq buf)))))
    (catch Exception e
      ;; Keep the best-effort contract (nil on failure) but leave a trace
      ;; so failed URLs are not lost silently.
      (binding [*out* *err*]
        (println "scrape failed for" url "-" (.getMessage e)))
      nil)))
(defn process-urls
  "Calls `scrape` on every URL in `list`, one after another, purely for
  its side effects. Returns nil."
  [list]
  ;; dorun forces the lazy map so each scrape actually runs, and
  ;; discards the results.
  (dorun (map scrape list)))
(defn run
  "Reads the file `f` (one URL per line), splits the URLs into at most
  `num-threads` chunks and hands each chunk to `process-urls` inside its
  own future.

  Returns nil without waiting for the futures to finish."
  [f]
  (let [;; Read every line eagerly so the reader can be closed before the
        ;; background work starts (line-seq on its own is lazy).
        urls (with-open [rdr (BufferedReader. (FileReader. f))]
               (doall (line-seq rdr)))
        total (count urls)
        ;; Ceiling division: the smallest chunk size that fits all URLs
        ;; into num-threads chunks. Clamped to 1 so an empty file does
        ;; not ask partition-all for a chunk size of zero.
        url-per-thread (max 1 (quot (+ total (dec num-threads)) num-threads))]
    (doseq [chunk (partition-all url-per-thread urls)]
      (future (process-urls chunk)))))
;;(run "url.list") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment