Skip to content

Instantly share code, notes, and snippets.

@aitor
Created October 3, 2008 17:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aitor/14592 to your computer and use it in GitHub Desktop.
Save aitor/14592 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'hpricot'
require 'open-uri'
# The openobject gem by Ara T. Howard, not the one from facets.
require 'openobject'
# Builds an open object describing the photo found in +element+.
#
# element - object whose +search+ is called with the "/img" and "/p" paths;
#           the first result of each supplies the image attributes and the
#           description text.
# id      - value stored as the photo's id.
#
# Returns the object with id, src, title and description filled in and an
# empty tags array; its +to_s+ renders a four-line summary.
def scrap(element, id)
  picture = element.search("/img").first
  caption = element.search("/p").first

  photo = oo do
    # Newline-terminated summary: photo title/id, source, tags, description.
    def to_s
      [
        "Photo : \"#{self.title}\" ##{self.id}",
        "Source : #{self.src}",
        "Tags : #{self.tags.join(', ')}",
        "Description : #{self.description}"
      ].map { |line| "#{line}\n" }.join
    end
  end

  photo.id = id
  photo.title = picture.attributes['title']
  photo.src = picture.attributes['src']
  photo.description = caption.inner_text
  photo.tags = []
  photo
end
# Saves the photo referenced by +image+ into the current directory, together
# with a "<filename>.txt" file holding the image's +to_s+ summary.
#
# image - object responding to +src+ (the image URL) and +to_s+.
#
# The local filename is the last path segment of the URL. Nothing is written
# (a warning goes to stderr and nil is returned) when the URL has no filename
# segment or the server does not answer with a 2xx status, so error pages are
# never stored under an image's name.
#
# Returns the result of the final write (the summary file) on success, nil
# when the download was skipped.
#
# NOTE(review): net/http is not required explicitly in this file; presumably
# open-uri has loaded it by the time this runs -- confirm.
def download(image)
  uri = URI.parse(image.src)
  filename = uri.path.to_s[%r{([^/]+)\z}, 1]
  unless filename
    warn "Skipping #{image.src}: no file name in URL"
    return nil
  end

  response = Net::HTTP.get_response(uri)
  unless response.is_a?(Net::HTTPSuccess)
    warn "Skipping #{image.src}: HTTP #{response.code} #{response.message}"
    return nil
  end

  # File.open rather than Kernel#open: the name is derived from remote markup,
  # and Kernel#open treats a leading "|" as a command to run.
  File.open(filename, "wb") { |file| file.write(response.body) }
  File.open("#{filename}.txt", "wb") { |file| file.write(image.to_s) }
end
# Walks shorpy.com node ids from 4579 down to 1. For each node: fetch the
# full-size page, scrape the photo data, collect the taxonomy tags, print the
# summary and download the image. A failure on one node is reported and the
# loop moves on to the next id.
#
# NOTE: open(url) relies on open-uri's Kernel#open patch, which newer Rubies
# (3.0+) no longer provide; URI.open is the replacement there.
4579.downto(1) do |id|
  begin
    doc = Hpricot(open("http://www.shorpy.com/node/#{id}?size=_original"))
    image = scrap(doc.search("//div[@class='content']"), id)
    doc.search("//div[@class='taxonomy']//li/a").each do |link|
      image.tags << link.inner_text
    end
    puts image.to_s
    download image
    puts "---------------------"
  rescue OpenURI::HTTPError => e
    puts "Error handling photo number ##{id} connection: #{e}"
    puts "---------------------"
  rescue StandardError => e
    # A node without the expected markup (scrap hits nil) or a failed image
    # download must not abort the remaining nodes.
    puts "Error handling photo number ##{id}: #{e.class}: #{e}"
    puts "---------------------"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment