Skip to content

Instantly share code, notes, and snippets.

@michaeleisel
Created January 12, 2019 20:25
Show Gist options
  • Save michaeleisel/a5087ba938e794d67bf1a50098f968cc to your computer and use it in GitHub Desktop.
Save michaeleisel/a5087ba938e794d67bf1a50098f968cc to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
require 'fileutils'
require 'mechanize'
require 'parallel'
# True when +href+ looks like a plain relative resource worth crawling:
# non-empty, no parent-directory or protocol markers ("..", "//"),
# not a mailto: link, not an absolute path, and not an HTML page.
def should_follow_url(href)
  return false if href.empty?
  return false if href.include?("..") || href.include?("//")
  return false if href.start_with?("mailto:", "/")
  !href.end_with?(".html")
end
# Queue of [source_url, destination_path] pairs collected by +follow+
# and downloaded later in parallel.
$to_fetch = []

# Recursively walks an auto-index style listing at +url+, mirroring the
# directory structure under +dir+ and appending each file's
# [url, destination] pair to $to_fetch for later download.
#
# agent - a Mechanize instance used to fetch listing pages
# dir   - local directory (with trailing "/") to mirror into
# url   - remote directory URL (with trailing "/") to crawl
def follow(agent, dir, url)
  puts url
  # FileUtils.mkdir_p instead of `mkdir #{dir}`: no shell interpolation
  # (a crafted href can't inject commands), creates intermediate
  # directories, and doesn't fail if the directory already exists.
  FileUtils.mkdir_p(dir)
  hrefs = agent.get(url).links.map(&:href).select { |h| should_follow_url(h) }.uniq
  # Hrefs ending in "/" are subdirectories to recurse into; the rest are files.
  subdirs, files = hrefs.partition { |h| h.end_with?("/") }
  subdirs.each { |h| follow(agent, dir + h, url + h) }
  $to_fetch.concat(files.map { |h| [url + h, dir + h] })
end
# Entry point: crawl the given URL's directory listing, then download
# every discovered file into a throwaway /tmp directory.
agent = Mechanize.new
raise "Usage: ./scrape.rb <url to scrape>" unless ARGV.size == 1
url = ARGV[0]
dir = "/tmp/scrape-#{Random.new.rand(10000000)}/" # Dir.mktmpdir("scrape")
puts "Directory is #{dir}"
follow(agent, dir, url)
Parallel.each($to_fetch) do |u, d|
  # List-form system bypasses the shell entirely, so scraped URLs/paths
  # cannot inject commands (the old `curl #{u} > #{d}` interpolated
  # untrusted strings into a shell line). "-s" keeps curl quiet and
  # "-o" writes the file directly, matching the old "2> /dev/null"
  # and shell-redirect behavior.
  system("curl", "-s", "-o", d, u)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment