Skip to content

Instantly share code, notes, and snippets.

@darkhelmet
Created November 2, 2009 04:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save darkhelmet/223929 to your computer and use it in GitHub Desktop.
Save darkhelmet/223929 to your computer and use it in GitHub Desktop.
# Walk every top-level page in main_links, follow each javascript:OpenLink(...)
# popup link found on it, and download the last <img> of every popup page.
# Relies on domain, main_links, decode_url and download_image, which are
# defined outside this block.
main_links.each do |link|
  dir = link.split('.').first # 'name' or 'name-2': the directory to download into
  begin
    index_page = Hpricot(open("#{domain}/#{link}"))
    popup_urls = index_page.search('a').select do |anchor|
      href = anchor.attributes['href'].to_s
      argument = href[/javascript:OpenLink\((.*)\)/, 1]
      # Require the OpenLink match itself: testing a stale/nil capture would
      # let every unrelated anchor through. Arguments that are just a quoted
      # 'word.word' file name are skipped.
      argument && argument !~ /^\'\w+\.\w+\'$/
    end.map do |anchor|
      # nil when the argument is not wrapped in single quotes
      anchor.attributes['href'].to_s[/javascript:OpenLink\(\'(.*)\'\)/, 1]
    end.compact.map do |encoded|
      decode_url(encoded)
    end.uniq
    popup_urls.each do |popup_url|
      begin
        popup_page = Hpricot(open(popup_url))
        image = popup_page.search('img').last.attributes['src']
        # scheme://host of the popup page ("http:", "", "host" joined by '/');
        # it has to come from the absolute popup URL, not the relative link name
        host = popup_url.split('/').first(3).join('/')
        download_image(dir, host, image)
      rescue Timeout::Error
        print "Timeout on #{popup_url}...\n"
      rescue => e
        # Also reached when the popup page has no <img> at all (nil.attributes)
        print "Error on #{popup_url}: {#{e}}...\n"
      end
    end
  rescue Timeout::Error
    print "Timeout on #{domain}/#{link}...\n"
  rescue
    print "Failed on #{domain}/#{link}...\n"
  end
end
require 'socket'
# Reports whether +site+ appears to serve a real page.
#
# Sends a plain-text HTTP GET over a raw TCP socket and reads the whole
# response (headers and body). The page counts as unavailable when the
# response contains the text "404 Message" -- presumably the marker of the
# target site's error page; confirm against the site. The HTTP status code
# itself is not inspected, and TLS is not spoken, so https URLs are not
# supported.
#
# site - absolute URL as a String.
#
# Returns true when the marker is absent, false when it is present or when
# the request fails (the error is printed, not raised). An unparsable URL
# still raises from URI().
def page_avail?(site)
  uri = URI(site)
  begin
    # An empty path would produce the invalid request line "GET  HTTP/1.1"
    path = uri.path.to_s.empty? ? '/' : uri.path
    path = "#{path}?#{uri.query}" if uri.query
    # Name the real host (with any non-default port) so virtual hosts resolve
    host = uri.port == uri.default_port ? uri.host : "#{uri.host}:#{uri.port}"
    # Connection: close makes the server end the stream after one response;
    # without it an HTTP/1.1 keep-alive connection leaves read() blocked
    request = "GET #{path} HTTP/1.1\r\nHost: #{host}\r\nConnection: close\r\n\r\n"
    response = TCPSocket.open(uri.host, uri.port) do |socket|
      socket.write(request)
      socket.read
    end
    !response.include?('404 Message')
  rescue => e
    print "Error on #{site}: {#{e}}...\n"
    false
  end
end
# Probe for numbered continuation pages ("name-2.htm" .. "name-20.htm") of
# every known link and append the ones page_avail? accepts to main_links.
# Iterates over a snapshot (dup) so the additions are not probed again.
main_links.dup.each do |entry|
  base = entry.split('.').first # entries look like "name.htm"; keep "name"
  (2..20).each do |page_no|
    candidate = "#{base}-#{page_no}.htm"
    main_links << candidate if page_avail?("#{domain}/#{candidate}")
    sleep 2 # pause between probes
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment