Skip to content

Instantly share code, notes, and snippets.

@varnie
Created August 4, 2011 16:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save varnie/1125528 to your computer and use it in GitHub Desktop.
Save varnie/1125528 to your computer and use it in GitHub Desktop.
Three ways of handling images from the net using Ruby
#!/usr/bin/env ruby
require 'open-uri'
require 'pool' #taken from http://burgestrand.se/code/ruby-thread-pool/
# Matches an <img ...> tag whose src attribute is quoted.
# Capture 1 is the quote character, capture 2 is the src value and capture 3
# is the optional leading scheme (same group layout as before, so code that
# reads capture index 1 of a scan result keeps working).
# Attributes may appear before src (the old `[^(src)]*` was a character class
# that rejected any tag containing the letters s, r or c before src, such as
# class="..."); whitespace is required before src so data-src is not matched.
IMG_REGEXP = %r{<img\b[^>]*?\ssrc=("|')((https?://)?[\w./:-]+)\1[^>]*>}i
# Matches a name ending in a supported image extension, ignoring case.
# Anchored with \z so the extension must be at the very end of the string,
# not merely at the end of one of its lines.
VALID_IMAGE_EXTENSION_REGEXP = /\.(gif|tif|png|jpeg|jpg)\z/i
# Checks +image+ against VALID_IMAGE_EXTENSION_REGEXP.
# Returns false when +image+ is nil; otherwise returns the result of the
# =~ match (the match position when it matches, nil when it does not).
def image_extension_correct?(image)
  return false if image.nil?

  image =~ VALID_IMAGE_EXTENSION_REGEXP
end
# Fetches the page at +address+ and collects image URLs by scanning the raw
# HTML with IMG_REGEXP.
#
# address - URL of the page to scan.
#
# Returns an Array of String URLs, in document order. Sources that fail
# image_extension_correct? are skipped. Sources that already start with
# http:// or https:// are kept as they are; every other source is resolved
# against +address+ with URI.join, and sources that cannot be resolved are
# skipped.
def grab_images_by_hand(address)
  result = []
  # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # fetches URLs. The block form closes the handle when done.
  URI.open(address) do |f|
    f.read.scan(IMG_REGEXP) do |img|
      # scan yields the capture groups; index 1 is the src value.
      img_src = img[1]
      next unless image_extension_correct?(img_src)

      unless img_src =~ %r{\Ahttps?://}i
        begin
          img_src = URI.join(address, img_src).to_s
        rescue URI::Error
          next
        end
      end
      result << img_src
    end
  end
  result
end
# Fetches the page at +address+ and collects image URLs from its <img>
# elements using Hpricot.
#
# address - URL of the page to scan.
#
# Returns an Array of String URLs. Sources that fail
# image_extension_correct? are skipped. Sources that already start with
# http:// or https:// are kept as they are; every other source is resolved
# against +address+ with URI.join, and sources that cannot be resolved are
# skipped.
def grab_images_using_hpricot(address)
  # Loaded here so the gem is only needed when this strategy is used.
  require 'rubygems'
  require 'hpricot'
  result = []
  # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # fetches URLs. Reading inside the block closes the handle afterwards.
  html = URI.open(address, &:read)
  doc = Hpricot(html)
  doc.search("img").each do |img|
    img_src = img.attributes['src']
    next unless image_extension_correct?(img_src)

    unless img_src =~ %r{\Ahttps?://}i
      begin
        img_src = URI.join(address, img_src).to_s
      rescue URI::Error
        next
      end
    end
    result << img_src
  end
  result
end
# Fetches the page at +address+ and collects image URLs from its <img>
# elements using Nokogiri.
#
# address - URL of the page to scan.
#
# Returns an Array of String URLs. Elements without a src attribute and
# sources that fail image_extension_correct? are skipped. Sources that
# already start with http:// or https:// are kept as they are; every other
# source is resolved against +address+ with URI.join, and sources that
# cannot be resolved are skipped.
def grab_images_using_nokogiri(address)
  # Loaded here so the gem is only needed when this strategy is used.
  require 'rubygems'
  require 'nokogiri'
  result = []
  # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # fetches URLs. Reading inside the block closes the handle afterwards.
  html = URI.open(address, &:read)
  doc = Nokogiri::HTML(html)
  doc.xpath('//img').each do |node|
    # An <img> may have no src attribute; guard the nil before asking for text.
    src_attr = node.attribute('src')
    img_src = src_attr && src_attr.text
    next unless image_extension_correct?(img_src)

    unless img_src =~ %r{\Ahttps?://}i
      begin
        img_src = URI.join(address, img_src).to_s
      rescue URI::Error
        next
      end
    end
    result << img_src
  end
  result
end
# Prints the command-line synopsis to standard output and terminates the
# program with exit status 0. Never returns to the caller.
def usage
  $stdout.puts('Usage: URL destination_folder')
  exit(0)
end
# Downloads every image found on the page at +url+ into +dstFolder+.
#
# url       - page address; one trailing '/' is dropped and 'http://' is
#             prepended when no http:// or https:// scheme is present.
#             The caller's string is not modified.
# dstFolder - path of the folder the image files are written to.
#
# Image URLs come from grab_images_by_hand. Each download is scheduled on a
# Pool of 25 workers (Pool comes from the 'pool' library required at the top
# of the file). A failed download is reported on standard output and does
# not stop the others. The pool shutdown and the summary line are registered
# with at_exit, so they run when the program terminates.
def process(url, dstFolder)
  url = url.chomp('/')
  url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
  images = grab_images_by_hand(url)
  count = 0
  # The counter is updated from several pool jobs, so guard the increment.
  count_lock = Mutex.new
  pool = Pool.new(25)
  images.each do |img_src|
    pool.schedule do
      print '.'
      # File name is whatever follows the last '/' of the image URL.
      img_name = File.basename(img_src)
      begin
        # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no
        # longer fetches URLs. Reading inside the block closes the handle.
        img_body = URI.open(img_src, &:read)
        # write, not puts: puts would append a newline to the binary data.
        File.open(File.join(dstFolder, img_name), 'wb') { |f| f.write(img_body) }
        count_lock.synchronize { count += 1 }
      rescue StandardError => e
        puts "an exception: #{e}"
      end
    end
  end
  # NOTE(review): presumably pool.shutdown waits for the scheduled jobs to
  # finish before the summary is printed - confirm against the pool library.
  at_exit { pool.shutdown; puts "downloaded #{count} images into #{dstFolder} folder" }
end
########################################################################
# Command-line entry point: expects a page URL and a destination folder.
# usage prints the synopsis and exits, so nothing after it runs when the
# arguments are missing or empty.
usage if ARGV.length < 2

url = ARGV[0].chomp
dstFolder = ARGV[1].chomp
usage if url.empty? || dstFolder.empty?

# Create the destination when nothing exists at that path yet.
Dir.mkdir(dstFolder) unless File.exist?(dstFolder)

if File.directory?(dstFolder)
  process(url, dstFolder)
else
  puts "#{dstFolder} is not a folder"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment