aitor (owner)

Revisions

gist: 14592 Download_button fork
public
Public Clone URL: git://gist.github.com/14592.git
Embed All Files: show embed
Text #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
require 'rubygems'
require 'hpricot'
require 'net/http'
require 'open-uri'
 
# Ara T. Howard's openobject gem, not the one from facets.
require 'openobject'
 
# Builds the object that describes one photo page.
#
# element - node (or node set) answering +search+; the first match for "/img"
#           supplies the source URL and title, the first match for "/p" the
#           description.
# id      - number of the page, stored on the result as +id+.
#
# Returns an open object with +id+, +src+, +title+, +tags+ (an empty array the
# caller is expected to fill) and +description+, plus a +to_s+ that renders a
# multi-line text summary. When there is no "/p" match the description is an
# empty string.
#
# Raises ArgumentError when there is no "/img" match, since the object would
# have neither a source URL nor a title.
def scrap(element, id)
  img = element.search("/img").first
  raise ArgumentError, "no <img> found in the content of photo ##{id}" unless img

  paragraph = element.search("/p").first

  # The block passed to `oo` defines to_s for the new object.
  # NOTE(review): this relies on openobject evaluating the block in the
  # context of the object it creates - confirm against the gem.
  image = oo {
    def to_s
      <<-eos
        Photo : "#{self.title}" ##{self.id}
        Source : #{self.src}
        Tags : #{self.tags.join(', ')}
        Description : #{self.description}
      eos
    end
  }

  image.id = id
  image.src = img.attributes['src']
  image.title = img.attributes['title']
  image.tags = []
  # Not every page necessarily has a caption paragraph.
  image.description = paragraph ? paragraph.inner_text : ''
  image
end
 
 
# Saves the photo referenced by +image+ into the current working directory,
# together with a "<file name>.txt" companion file holding +image.to_s+.
#
# The file name is the last segment of the URL path in +image.src+. Nothing is
# written, and a warning is printed, when no file name can be derived from the
# URL or when the server does not answer with a 2xx status, so an error page
# is never stored as if it were the photo.
#
# image - object responding to +src+ (HTTP URL of the photo) and +to_s+.
#
# Returns the number of bytes written to the companion file, or nil when the
# download was skipped.
def download(image)
  uri = URI.parse(image.src)
  file_name = File.basename(uri.path.to_s)

  # An empty path, "/" or a dot segment has no usable last segment.
  if ['', '/', '.', '..'].include?(file_name)
    warn "Skipping #{image.src}: cannot derive a file name from the URL"
    return nil
  end

  response = Net::HTTP.get_response(uri)
  unless response.is_a?(Net::HTTPSuccess)
    warn "Skipping #{image.src}: server answered #{response.code} #{response.message}"
    return nil
  end

  # File.open rather than Kernel#open: the name comes from a remote URL, and
  # Kernel#open gives special meaning to names that start with "|".
  File.open(file_name, 'wb') { |file| file.write(response.read_body) }
  File.open("#{file_name}.txt", 'wb') { |file| file.write(image.to_s) }
end
 
 
# Walks the photo pages from node 4579 down to node 1. For each page it builds
# the image object from the "content" div, adds the text of every taxonomy
# link as a tag, prints the summary and downloads the photo.
#
# Pages that answer with an HTTP error are reported and skipped; any other
# exception propagates and stops the run.
4579.downto(1) do |id|
  begin
    page_url = "http://www.shorpy.com/node/#{id}?size=_original"

    # URI#open (added by open-uri) in block form closes the handle as soon as
    # the page is parsed, and does not depend on Kernel#open accepting URLs,
    # which newer Ruby versions no longer support.
    doc = URI.parse(page_url).open { |page| Hpricot(page) }

    image = scrap(doc.search("//div[@class='content']"), id)
    doc.search("//div[@class='taxonomy']//li/a").each do |link|
      image.tags << link.inner_text
    end

    puts image.to_s
    download image
    puts "---------------------"
  rescue OpenURI::HTTPError => e
    puts "Error handling photo number ##{id} connection: #{e}"
    puts "---------------------"
  end
end