varnie/Ruby images downloader

## Ruby images downloader
#!/usr/bin/env ruby
require 'open-uri'
require 'pool' # taken from http://burgestrand.se/code/ruby-thread-pool/

# Matches an <img ...> tag and captures its src attribute.
# Capture groups (the order is relied on by String#scan callers):
#   1 - the quote character surrounding the src value
#   2 - the src value itself
#   3 - the optional "http://" / "https://" scheme prefix
# Other attributes may come before or after src, the tag may be self-closing,
# and tag/attribute names are matched case-insensitively.
IMG_REGEXP = %r{<img\s(?:[^>]*?\s)?src=(["'])((https?://)?[\w./:-]+)\1[^>]*>}i
# Matches a name ending in one of the supported image extensions, ignoring
# case. \z anchors at the real end of the string, not at the end of a line.
VALID_IMAGE_EXTENSION_REGEXP = /.*\.(gif|tif|png|jpeg|jpg)\z/i

# Tells whether +image+ names a file with a supported image extension.
#
# Returns false for nil. Otherwise returns the result of matching +image+
# against VALID_IMAGE_EXTENSION_REGEXP: the match index (truthy) on success,
# nil when the extension is not recognised.
def image_extension_correct?(image)
  return false if image.nil?

  image =~ VALID_IMAGE_EXTENSION_REGEXP
end

# Downloads the page at +address+ and collects image URLs by scanning the
# raw HTML with IMG_REGEXP (no HTML parser involved).
#
# Only sources accepted by image_extension_correct? are kept. Each source is
# resolved against +address+ with URI.join, so relative paths, root-relative
# paths and absolute http/https URLs all come out as absolute URLs.
#
# @param address [String] absolute URL of the page to scan
# @return [Array<String>] absolute image URLs, in document order
def grab_images_by_hand(address)
  # URI.open (not Kernel#open) is required to fetch URLs on current Rubies;
  # the block form closes the stream once the body has been read.
  html = URI.open(address, &:read)

  # Capture group 2 of IMG_REGEXP (index 1 of each scan result) is the src.
  sources = html.scan(IMG_REGEXP).map { |match| match[1] }
  sources.select { |src| image_extension_correct?(src) }
         .map { |src| URI.join(address, src).to_s }
end

# Downloads the page at +address+ and collects image URLs using Hpricot.
#
# Only sources accepted by image_extension_correct? are kept (an <img>
# without a src yields nil and is skipped). Each source is resolved against
# +address+ with URI.join so relative and https sources become valid
# absolute URLs.
#
# @param address [String] absolute URL of the page to scan
# @return [Array<String>] absolute image URLs, in document order
def grab_images_using_hpricot(address)
  # Loaded lazily so the gem is only needed when this strategy is chosen.
  require 'rubygems'
  require 'hpricot'

  # Parse inside the block so the downloaded stream is closed afterwards.
  doc = URI.open(address) { |io| Hpricot(io) }

  sources = doc.search('img').map { |img| img.attributes['src'] }
  sources.select { |src| image_extension_correct?(src) }
         .map { |src| URI.join(address, src).to_s }
end

# Downloads the page at +address+ and collects image URLs using Nokogiri.
#
# Only sources accepted by image_extension_correct? are kept. An <img>
# without a src attribute is skipped instead of raising. Each source is
# resolved against +address+ with URI.join so relative and https sources
# become valid absolute URLs.
#
# @param address [String] absolute URL of the page to scan
# @return [Array<String>] absolute image URLs, in document order
def grab_images_using_nokogiri(address)
  # Loaded lazily so the gem is only needed when this strategy is chosen.
  require 'rubygems'
  require 'nokogiri'

  # Parse inside the block so the downloaded stream is closed afterwards.
  doc = URI.open(address) { |io| Nokogiri::HTML(io) }

  # node['src'] is nil when the attribute is missing; the extension check
  # rejects nil, so such tags are simply ignored.
  sources = doc.xpath('//img').map { |node| node['src'] }
  sources.select { |src| image_extension_correct?(src) }
         .map { |src| URI.join(address, src).to_s }
end

# Writes the command-line synopsis to standard output and terminates the
# process with exit status 0. Never returns to the caller.
def usage
  $stdout.puts('Usage: URL destination_folder')
  exit(0)
end

# Downloads every image referenced by the page at +url+ into +dstFolder+.
#
# The page is scanned with grab_images_by_hand and each image is fetched as
# a job on a 25-worker Pool. A dot is printed per image as progress, failures
# are reported per image without aborting the others, and a summary line is
# printed once the pool has been shut down.
#
# @param url [String] page address; "http://" is assumed when no http/https
#   scheme is given, and one trailing "/" is ignored
# @param dstFolder [String] existing directory that receives the files
# @return [nil]
def process(url, dstFolder)
  # Work on a copy so the caller's string is never mutated.
  url = url.chomp('/')
  url = "http://#{url}" unless url.start_with?('http://', 'https://')
  images = grab_images_by_hand(url)

  count = 0
  # Jobs run on the pool's workers, so the shared counter is guarded.
  count_lock = Mutex.new
  pool = Pool.new(25)
  images.each do |img_src|
    pool.schedule do
      print '.'
      img_name = File.basename(img_src)

      begin
        img_body = URI.open(img_src, &:read)
        # IO#write, not IO#puts: puts appends a newline and corrupts binaries.
        File.open(File.join(dstFolder, img_name), 'wb') { |f| f.write(img_body) }
        count_lock.synchronize { count += 1 }
      rescue StandardError => e
        # Best effort: report the failure and carry on with the other images.
        puts "an exception: #{e}"
      end
    end
  end

  # The original deferred these two steps to an at_exit hook; running them
  # here gives the same order of events without leaving a hook behind.
  # NOTE(review): assumes Pool#shutdown waits for queued jobs - confirm.
  pool.shutdown
  puts "downloaded #{count} images into #{dstFolder} folder"
end

########################################################################
# Command-line entry point: expects a page URL and a destination folder.
# `usage` prints the synopsis and terminates the process, so nothing after
# it runs when the arguments are missing or empty.
usage if ARGV.length < 2

url = ARGV[0].chomp
dst_folder = ARGV[1].chomp

usage if url.empty? || dst_folder.empty?

# Create the destination on first use.
Dir.mkdir(dst_folder) unless File.exist?(dst_folder)

if File.directory?(dst_folder)
  process(url, dst_folder)
else
  # The path exists but is something other than a directory.
  puts "#{dst_folder} is not a folder"
end
	#!/usr/bin/env ruby
	require 'open-uri'
	require 'pool' # taken from http://burgestrand.se/code/ruby-thread-pool/

	# Matches an <img ...> tag and captures its src attribute.
	# Capture groups (the order is relied on by String#scan callers):
	#   1 - the quote character surrounding the src value
	#   2 - the src value itself
	#   3 - the optional "http://" / "https://" scheme prefix
	# Other attributes may come before or after src, the tag may be self-closing,
	# and tag/attribute names are matched case-insensitively.
	IMG_REGEXP = %r{<img\s(?:[^>]*?\s)?src=(["'])((https?://)?[\w./:-]+)\1[^>]*>}i
	# Matches a name ending in one of the supported image extensions, ignoring
	# case. \z anchors at the real end of the string, not at the end of a line.
	VALID_IMAGE_EXTENSION_REGEXP = /.*\.(gif|tif|png|jpeg|jpg)\z/i

	# Tells whether +image+ names a file with a supported image extension.
	#
	# Returns false for nil. Otherwise returns the result of matching +image+
	# against VALID_IMAGE_EXTENSION_REGEXP: the match index (truthy) on success,
	# nil when the extension is not recognised.
	def image_extension_correct?(image)
	  return false if image.nil?

	  image =~ VALID_IMAGE_EXTENSION_REGEXP
	end

	# Downloads the page at +address+ and collects image URLs by scanning the
	# raw HTML with IMG_REGEXP (no HTML parser involved).
	#
	# Only sources accepted by image_extension_correct? are kept. Each source is
	# resolved against +address+ with URI.join, so relative paths, root-relative
	# paths and absolute http/https URLs all come out as absolute URLs.
	#
	# @param address [String] absolute URL of the page to scan
	# @return [Array<String>] absolute image URLs, in document order
	def grab_images_by_hand(address)
	  # URI.open (not Kernel#open) is required to fetch URLs on current Rubies;
	  # the block form closes the stream once the body has been read.
	  html = URI.open(address, &:read)

	  # Capture group 2 of IMG_REGEXP (index 1 of each scan result) is the src.
	  sources = html.scan(IMG_REGEXP).map { |match| match[1] }
	  sources.select { |src| image_extension_correct?(src) }
	         .map { |src| URI.join(address, src).to_s }
	end

	# Downloads the page at +address+ and collects image URLs using Hpricot.
	#
	# Only sources accepted by image_extension_correct? are kept (an <img>
	# without a src yields nil and is skipped). Each source is resolved against
	# +address+ with URI.join so relative and https sources become valid
	# absolute URLs.
	#
	# @param address [String] absolute URL of the page to scan
	# @return [Array<String>] absolute image URLs, in document order
	def grab_images_using_hpricot(address)
	  # Loaded lazily so the gem is only needed when this strategy is chosen.
	  require 'rubygems'
	  require 'hpricot'

	  # Parse inside the block so the downloaded stream is closed afterwards.
	  doc = URI.open(address) { |io| Hpricot(io) }

	  sources = doc.search('img').map { |img| img.attributes['src'] }
	  sources.select { |src| image_extension_correct?(src) }
	         .map { |src| URI.join(address, src).to_s }
	end

	# Downloads the page at +address+ and collects image URLs using Nokogiri.
	#
	# Only sources accepted by image_extension_correct? are kept. An <img>
	# without a src attribute is skipped instead of raising. Each source is
	# resolved against +address+ with URI.join so relative and https sources
	# become valid absolute URLs.
	#
	# @param address [String] absolute URL of the page to scan
	# @return [Array<String>] absolute image URLs, in document order
	def grab_images_using_nokogiri(address)
	  # Loaded lazily so the gem is only needed when this strategy is chosen.
	  require 'rubygems'
	  require 'nokogiri'

	  # Parse inside the block so the downloaded stream is closed afterwards.
	  doc = URI.open(address) { |io| Nokogiri::HTML(io) }

	  # node['src'] is nil when the attribute is missing; the extension check
	  # rejects nil, so such tags are simply ignored.
	  sources = doc.xpath('//img').map { |node| node['src'] }
	  sources.select { |src| image_extension_correct?(src) }
	         .map { |src| URI.join(address, src).to_s }
	end

	# Writes the command-line synopsis to standard output and terminates the
	# process with exit status 0. Never returns to the caller.
	def usage
	  $stdout.puts('Usage: URL destination_folder')
	  exit(0)
	end

	# Downloads every image referenced by the page at +url+ into +dstFolder+.
	#
	# The page is scanned with grab_images_by_hand and each image is fetched as
	# a job on a 25-worker Pool. A dot is printed per image as progress, failures
	# are reported per image without aborting the others, and a summary line is
	# printed once the pool has been shut down.
	#
	# @param url [String] page address; "http://" is assumed when no http/https
	#   scheme is given, and one trailing "/" is ignored
	# @param dstFolder [String] existing directory that receives the files
	# @return [nil]
	def process(url, dstFolder)
	  # Work on a copy so the caller's string is never mutated.
	  url = url.chomp('/')
	  url = "http://#{url}" unless url.start_with?('http://', 'https://')
	  images = grab_images_by_hand(url)

	  count = 0
	  # Jobs run on the pool's workers, so the shared counter is guarded.
	  count_lock = Mutex.new
	  pool = Pool.new(25)
	  images.each do |img_src|
	    pool.schedule do
	      print '.'
	      img_name = File.basename(img_src)

	      begin
	        img_body = URI.open(img_src, &:read)
	        # IO#write, not IO#puts: puts appends a newline and corrupts binaries.
	        File.open(File.join(dstFolder, img_name), 'wb') { |f| f.write(img_body) }
	        count_lock.synchronize { count += 1 }
	      rescue StandardError => e
	        # Best effort: report the failure and carry on with the other images.
	        puts "an exception: #{e}"
	      end
	    end
	  end

	  # The original deferred these two steps to an at_exit hook; running them
	  # here gives the same order of events without leaving a hook behind.
	  # NOTE(review): assumes Pool#shutdown waits for queued jobs - confirm.
	  pool.shutdown
	  puts "downloaded #{count} images into #{dstFolder} folder"
	end

	########################################################################
	# Command-line entry point: expects a page URL and a destination folder.
	# `usage` prints the synopsis and terminates the process, so nothing after
	# it runs when the arguments are missing or empty.
	usage if ARGV.length < 2

	url = ARGV[0].chomp
	dst_folder = ARGV[1].chomp

	usage if url.empty? || dst_folder.empty?

	# Create the destination on first use.
	Dir.mkdir(dst_folder) unless File.exist?(dst_folder)

	if File.directory?(dst_folder)
	  process(url, dst_folder)
	else
	  # The path exists but is something other than a directory.
	  puts "#{dst_folder} is not a folder"
	end