@rtanglao
Created September 15, 2010 06:09
Download Flickr original-size photos using metadata read from stdin or from a file given as a command-line argument.
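Usage (the script filename is hypothetical): ruby download_flickr_originals.rb pages.json, or pipe the JSON in on stdin; ARGF handles both.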
#!/usr/bin/env ruby
require 'json'
require 'curb'
# Reads serialized Flickr JSON (one API response page per line) from $stdin or
# from a file named on the command line, then downloads each photo's original
# size into the current directory.
def chunk_array(array, pieces = 2)
  len = array.length
  mid = len / pieces
  chunks = []
  start = 0
  1.upto(pieces) do |i|
    last = start + mid
    last -= 1 unless len % pieces >= i
    # Parentheses ensure an out-of-range (nil) slice becomes an empty chunk
    # instead of pushing nil onto the result.
    chunks << (array[start..last] || [])
    start = last + 1
  end
  chunks
end
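# For instance, chunk_array((1..7).to_a, 3) returns [[1, 2, 3], [4, 5], [6, 7]]:
# any remainder is absorbed by the leading chunks.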
def fetch_parallel(urls)
  Curl::Multi.download(urls) do |c, code, method|
    # Name the local file after the last path segment of the URL, with any
    # query string stripped.
    filename = c.url.split(/\?/).first.split(/\//).last
    $stderr.puts "fetching: #{filename}"
  end
end
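# For instance, a URL like "http://farm5.static.flickr.com/4041/1234_abcd_o.jpg"
# (farm/server values illustrative) is saved locally as "1234_abcd_o.jpg".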
ARGF.each_line do |line|
  # Each input line is one serialized Flickr API response page.
  flickr_data_page = JSON.parse(line)
  total = flickr_data_page["photos"]["total"].to_i
  total_pages = flickr_data_page["photos"]["pages"].to_i
  page = flickr_data_page["photos"]["page"].to_i
  $stderr.puts "Total photos to download: #{total}, page #{page} of #{total_pages}"
  # 250 photos per page; the last page holds the remainder. Guard the
  # exact-multiple case, where total % 250 would be 0.
  if page == total_pages
    total_to_download_for_this_page = total % 250
    total_to_download_for_this_page = 250 if total_to_download_for_this_page.zero? && total > 0
  else
    total_to_download_for_this_page = 250
  end
  # Collect the original-size URL (url_o) for each photo on this page.
  urls = flickr_data_page["photos"]["photo"].first(total_to_download_for_this_page).map { |p| p["url_o"] }
  # Retrieve 10 URLs at a time: each page holds up to 250, so 25 chunks.
  chunks = chunk_array(urls, 25)
  chunks.each do |chunk_of_urls|
    fetch_parallel(chunk_of_urls) unless chunk_of_urls.empty?
  end
end
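
For reference, each input line should be a complete Flickr API response page, e.g. from flickr.photos.search called with extras=url_o, per_page=250, format=json and nojsoncallback=1. Below is a minimal feeder sketch, not part of the original gist: the FLICKR_API_KEY environment variable, the script names, and the positional arguments are all assumptions for illustration.

#!/usr/bin/env ruby
# Hypothetical feeder (feed_flickr_pages.rb): prints one serialized JSON
# response page per line, so its output can be piped into the downloader above.
# Assumes FLICKR_API_KEY is set and the account exposes original sizes (url_o).
require 'curb'

api_key = ENV['FLICKR_API_KEY']
user_id = ARGV.fetch(0) # e.g. "35468159852@N01"
pages   = (ARGV[1] || 1).to_i

1.upto(pages) do |page|
  url = "https://api.flickr.com/services/rest/?method=flickr.photos.search" \
        "&api_key=#{api_key}&user_id=#{user_id}&extras=url_o" \
        "&per_page=250&page=#{page}&format=json&nojsoncallback=1"
  puts Curl::Easy.http_get(url).body_str
end

Piped together (filenames hypothetical): ruby feed_flickr_pages.rb 35468159852@N01 4 | ruby download_flickr_originals.rb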