Skip to content

Instantly share code, notes, and snippets.

@taf2
Created October 7, 2009 17:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taf2/204219 to your computer and use it in GitHub Desktop.
Save taf2/204219 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'hpricot'
require 'curb'
require 'uri'
require 'fileutils'
# Registry of pages already fetched by grab_page, keyed by absolute URL
# string (the value stored is always true). Consulted before following a link
# so that the same URL is not crawled twice.
$URLDB = {}
# Decides whether an href/src attribute value is something the crawler can
# fetch over HTTP.
#
# ref - the raw attribute value (may be nil).
#
# Returns false for nil, blank and fragment-only ("#top") values and for
# values that start with a non-HTTP scheme (mailto:, javascript:, data:, ...);
# true otherwise.
def fetchable_ref?(ref)
  return false if ref.nil?

  ref = ref.strip
  return false if ref.empty? || ref =~ /\A#/
  return true if ref =~ /\Ahttps?:/i

  # Anything else that begins with "scheme:" cannot be downloaded with HTTP.
  (ref =~ /\A[a-z][a-z0-9+.\-]*:/i).nil?
end

# Resolves +ref+ against +page_uri+ and downloads it with grab_file.
#
# page_uri - parsed URI of the page the reference was found on.
# ref      - raw src attribute value (may be nil).
#
# Returns the local path reported by grab_file, or nil when the reference was
# skipped (not fetchable, or not parseable as a URI).
def grab_asset(page_uri, ref)
  return nil unless fetchable_ref?(ref)

  grab_file(abs_url(page_uri, ref.strip))
rescue URI::InvalidURIError => e
  # One malformed attribute must not abort the whole crawl.
  STDERR.puts "skip: #{ref.inspect} (#{e.message})"
  nil
end

# Fetches the page at +url+, stores it in the local mirror, downloads the
# images, scripts and <link> resources it references, and then recursively
# does the same for every <a href> that points at the same host and has not
# been fetched yet (as recorded in $URLDB).
#
# url - absolute URL string of the page to fetch.
#
# NOTE: the crawl is recursive, so the call depth grows with the length of
# the link chain being followed.
def grab_page(url)
  uri = URI.parse(url)
  c = Curl::Easy.new(url)
  c.perform
  $URLDB[url] = true

  # save the html
  dirname, filename = make_path(uri)
  path = File.join(dirname, filename)
  File.open(path, "wb") { |f| f << c.body_str }

  doc = Hpricot(c.body_str)

  # images
  (doc/:img).each { |img| grab_asset(uri, img['src']) }

  # scripts
  (doc/:script).each { |script| grab_asset(uri, script['src']) }

  # <link> resources: every one is downloaded, but only stylesheets are
  # scanned for background images (a favicon or feed is not CSS).
  (doc/:link).each do |link|
    href = link['href']
    next unless fetchable_ref?(href)

    begin
      l = abs_url(uri, href.strip)
      css_file = grab_file(l)
      rel_tokens = link['rel'].to_s.downcase.split
      css_type = link['type'].to_s.strip.downcase == 'text/css'
      if rel_tokens.include?('stylesheet') || css_type
        grab_css_images(css_file, URI.parse(l), l)
      end
    rescue URI::InvalidURIError => e
      STDERR.puts "skip: #{href.inspect} (#{e.message})"
    end
  end

  # links
  (doc/:a).each do |anchor|
    href = anchor['href']
    next unless fetchable_ref?(href)

    begin
      # The fragment never reaches the server; dropping it keeps "page#a" and
      # "page#b" from being fetched as two different pages.
      link = abs_url(uri, href.strip.sub(/#.*\z/m, ''))
      next unless URI.parse(link).host == uri.host && !$URLDB[link]

      puts "fetch link: #{link}"
      grab_page(link)
    rescue URI::InvalidURIError => e
      STDERR.puts "skip: #{href.inspect} (#{e.message})"
    end
  end
end
# Downloads the images referenced by background / background-image
# declarations in a local CSS file.
#
# file - path of the CSS file on disk.
# uri  - parsed URI the stylesheet was fetched from; relative image
#        references are resolved against it via abs_url.
# link - the stylesheet URL as a string; not used by this method, kept so
#        existing callers continue to work.
#
# Each reference found is passed to grab_file. Inline "data:" URIs are
# skipped because there is nothing to download.
def grab_css_images(file, uri, link)
  # Read as raw bytes: a stylesheet (or a mis-labelled binary file) may not be
  # valid UTF-8, and scanning an invalid text string can raise.
  css = File.open(file, 'rb') { |f| f.read }

  # Lazy matching keeps each url(...) separate even when several rules share
  # a line; the optional quote is captured so the same character closes it.
  css.scan(/background(?:-image)?\s*:[^;}]*?url\(\s*(['"]?)(.*?)\1\s*\)/i) do |_quote, ref|
    ref = ref.strip
    next if ref.empty? || ref =~ /\Adata:/i

    grab_file(abs_url(uri, ref))
  end
end
# Downloads +url+ into the local mirror unless a file already exists at the
# target location, reporting progress as it goes.
#
# url - absolute URL string of the resource.
#
# Returns the local path of the (cached or freshly downloaded) file.
def grab_file(url)
  STDERR.print "url: #{url.inspect} -> "

  target_dir, target_name = make_path(URI.parse(url))
  print "#{target_dir}/#{target_name} "

  local_path = File.join(target_dir, target_name)
  already_there = File.exist?(local_path)

  # Only hit the network when nothing is on disk yet.
  Curl::Easy.download(url, local_path) unless already_there
  puts(already_there ? ": cached" : ": fetched")

  local_path
end
# Turns a reference found in a document into an absolute URL string.
#
# uri - parsed URI of the document the reference appeared in; its scheme,
#       host, port and path are read.
# url - the reference as written: absolute ("http://..." or "https://..."),
#       protocol-relative ("//host/x"), root-relative ("/x") or relative
#       ("x", "sub/x").
#
# Returns the absolute URL as a String. The base document's scheme (and port,
# when it is not the scheme's default) is preserved; "http" is assumed when
# the base has no scheme.
def abs_url(uri, url)
  # Already absolute, whether plain or TLS.
  return url if url =~ /\Ahttps?:/i

  scheme = uri.scheme || 'http'
  # Protocol-relative: inherit the scheme of the referring document.
  return "#{scheme}:#{url}" if url =~ %r{\A//}

  origin = "#{scheme}://#{uri.host}"
  origin += ":#{uri.port}" if uri.port && uri.port != uri.default_port
  return origin + url if url =~ %r{\A/}

  # relative...
  base_path = uri.path.to_s
  if base_path =~ %r{/\z}
    # "/dir/" names the directory itself, so resolve inside it.
    folder = base_path
  else
    folder = File.dirname(base_path)
    # File.dirname("") is "."; a base without a path lives at the root.
    folder = '/' if folder == '.'
  end
  origin + "#{folder}/#{url}".squeeze('/')
end
# Maps a remote URI to a location inside the local mirror and makes sure the
# target directory exists.
#
# The mirror is rooted at a directory named after the URI's host, relative to
# the current working directory. A URI that names a directory (empty path,
# "/", or any path ending in "/") is stored as "index.html" inside that
# directory, so it cannot collide with files later saved beneath it.
#
# uri - a parsed URI; its host and path are read.
#
# Returns a two-element Array: [local_directory, filename].
def make_path(uri)
  remote_path = uri.path.to_s
  # "http://host" parses with an empty path; treat it like "http://host/".
  remote_path = '/' if remote_path.empty?

  if remote_path =~ %r{/\z}
    dir = remote_path.chomp('/')
    dir = '/' if dir.empty?
    filename = 'index.html'
  else
    dir = File.dirname(remote_path)
    filename = File.basename(remote_path)
  end

  local_dir = uri.host + dir
  unless File.exist?(local_dir)
    STDERR.puts "create: #{local_dir}"
    FileUtils.mkdir_p(local_dir)
  end

  [local_dir, filename]
end
# Entry point: start the crawl from the URL given as the first command-line
# argument.
grab_page(ARGV[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment