#!/usr/bin/env ruby
#
# Testing multipart uploads into s3 with threads
# Tested with Ruby 1.8 and 1.9
# This is proof-of-concept code: it works, but it is not suitable for production and may
# even have nasty bugs in the threading section
# Refs:
# http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadInitiate.html
# http://docs.amazonwebservices.com/AWSEC2/latest/UserGuide/index.html?using-query-api.html <-- Query API auth
#
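# The upload below follows the three steps of the S3 multipart API: initiate_multipart_upload
# returns an UploadId, each chunk is sent with upload_part (part numbers start at 1 and must
# stay in order), and complete_multipart_upload stitches the parts together using the ETag
# returned for each part.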
require 'rubygems'
require 'fog'
require 'digest/md5'
require 'base64'
require 'fileutils'
# Credentials
key = 'AAAA'
secret = 'BBBB'
bucket = 'some-bucket'
# Setup connection
stor = Fog::Storage.new(
  :provider              => 'AWS',
  :aws_access_key_id     => key,
  :aws_secret_access_key => secret
)
# Don't want to get caught out with any time errors
stor.sync_clock
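# (Signed AWS requests carry a timestamp, so a skewed local clock leads to
# RequestTimeTooSkewed errors from S3.)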
# Take a test file and split it up, remove the initial / to use the filename and path as the key
#object_to_upload = '/tmp/linux-2.6.38.2.tar.bz2'
object_to_upload = '/tmp/ubuntu-10.04.2-server-amd64.iso'
object_key = object_to_upload[1..-1]
# Area to place the split file into
workdir = "/tmp/work"
FileUtils.mkdir_p(workdir)
# Split the file into chunks named 000, 001, etc inside the work directory
#`split -C 10M -a 3 -d #{object_to_upload} #{workdir}/`
`split -C 100M -a 3 -d #{object_to_upload} #{workdir}/`
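# Note: S3 requires every part except the last one to be at least 5MB, so keep the chunk size above that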
# Map of the file_part => md5
parts = {}
# Get the Base64 encoded MD5 of each file
Dir.entries(workdir).each do |file|
  next if file == '.' || file == '..'
  full_path = "#{workdir}/#{file}"
  # Content-MD5 wants the Base64 of the binary digest, not the hex string
  md5 = Base64.encode64(Digest::MD5.file(full_path).digest).chomp
  parts[full_path] = md5
end
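# Sending Content-MD5 with each part lets S3 verify the bytes it received and reject a corrupted chunk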
### Now ready to perform the actual upload
# Initiate the upload and get the uploadid
multi_part_up = stor.initiate_multipart_upload(bucket, object_key, { 'x-amz-acl' => 'private' } )
upload_id = multi_part_up.body["UploadId"]
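# The UploadId ties every subsequent upload_part call, and the final completion request, to this upload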
# Lists for the threads and tags
tags = []
threads = []
# Sort by filename; split's zero-padded suffixes (000, 001, ...) keep the parts in upload order
sorted_parts = parts.sort_by { |path, _md5| path }
sorted_parts.each_with_index do |entry, idx|
  # Part numbers need to start at 1
  part_number = idx + 1
  # Reload to stop the connection timing out, useful when uploading large chunks
  stor.reload
  # Create a new thread for each part we want to upload
  threads << Thread.new(entry) do |e|
    print "DEBUG: Starting on File: #{e[0]} with MD5: #{e[1]} - this is part #{part_number} \n"
    # Pass fog a file object to upload
    File.open(e[0], 'rb') do |file_part|
      # part_number and idx are fresh locals on each iteration, so every thread captures its
      # own copy; entry is handed to the thread explicitly via Thread.new
      part_upload = stor.upload_part(bucket, object_key, upload_id, part_number, file_part, { 'Content-MD5' => e[1] })
      # The tags array must hold the ETags in part-number order, else the upload won't complete
      tags[idx] = part_upload.headers["ETag"]
      print "#{part_upload.inspect} \n" # upload_part blocks, so this prints once the part has finished uploading
    end
  end
end
# Make sure all of our threads have finished before we continue
threads.each do |t|
  begin
    t.join
  rescue Exception => e
    puts "Failed: #{e.message}"
  end
end
# Might want a stor.reload here...
completed_upload = stor.complete_multipart_upload(bucket, object_key, upload_id, tags)
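# On success the completion response names the assembled object (bucket, key and ETag); if the upload is
# abandoned instead, abort_multipart_upload(bucket, object_key, upload_id) tells S3 to discard the stored parts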