Skip to content

Instantly share code, notes, and snippets.

@michaeleisel
Created January 12, 2019 20:25
Show Gist options
  • Save michaeleisel/a5087ba938e794d67bf1a50098f968cc to your computer and use it in GitHub Desktop.
Save michaeleisel/a5087ba938e794d67bf1a50098f968cc to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
require 'fileutils'
require 'mechanize'
require 'parallel'
# True when +href+ looks like a plain relative resource worth crawling:
# non-empty, no parent-directory or protocol markers ("..", "//"),
# not a mailto: link, not an absolute path, and not an HTML page.
def should_follow_url(href)
  return false if href.empty?
  return false if href.include?("..") || href.include?("//")
  return false if href.start_with?("mailto:", "/")
  !href.end_with?(".html")
end
# Queue of [source_url, destination_path] pairs collected by +follow+
# and downloaded later in parallel.
$to_fetch = []

# Recursively walks an auto-index style listing at +url+, mirroring the
# directory structure under +dir+ and appending each file's
# [url, destination] pair to $to_fetch for later download.
#
# agent - a Mechanize instance used to fetch listing pages
# dir   - local directory (with trailing "/") to mirror into
# url   - remote directory URL (with trailing "/") to crawl
def follow(agent, dir, url)
  puts url
  # FileUtils.mkdir_p instead of `mkdir #{dir}`: no shell interpolation
  # (a crafted href can't inject commands), creates intermediate
  # directories, and doesn't fail if the directory already exists.
  FileUtils.mkdir_p(dir)
  hrefs = agent.get(url).links.map(&:href).select { |h| should_follow_url(h) }.uniq
  # Hrefs ending in "/" are subdirectories to recurse into; the rest are files.
  subdirs, files = hrefs.partition { |h| h.end_with?("/") }
  subdirs.each { |h| follow(agent, dir + h, url + h) }
  $to_fetch.concat(files.map { |h| [url + h, dir + h] })
end
# Entry point: crawl the given URL's directory listing, then download
# every discovered file into a throwaway /tmp directory.
agent = Mechanize.new
raise "Usage: ./scrape.rb <url to scrape>" unless ARGV.size == 1
url = ARGV[0]
dir = "/tmp/scrape-#{Random.new.rand(10000000)}/" # Dir.mktmpdir("scrape")
puts "Directory is #{dir}"
follow(agent, dir, url)
Parallel.each($to_fetch) do |u, d|
  # List-form system bypasses the shell entirely, so scraped URLs/paths
  # cannot inject commands (the old `curl #{u} > #{d}` interpolated
  # untrusted strings into a shell line). "-s" keeps curl quiet and
  # "-o" writes the file directly, matching the old "2> /dev/null"
  # and shell-redirect behavior.
  system("curl", "-s", "-o", d, u)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment