
@jolynch
Last active December 10, 2023 08:05
Upload a directory to S3 as fast as humanly possible
$ ls -laR
.:
total 6291456
drwxr-sr-x 3 user group 136 Jul 25 01:47 .
drwxrwsr-x 5 user group 67 Jul 25 01:48 ..
drwxr-sr-x 2 root group 46 Jul 25 01:40 foo
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.2.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.3.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.4.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.5.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.6.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.7.0
./foo:
total 2097152
drwxr-sr-x 2 root group 46 Jul 25 01:40 .
drwxr-sr-x 3 user group 136 Jul 25 01:47 ..
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.0.0
-rw-r--r-- 1 user group 1073741824 Jul 25 00:50 randwrite.1.0
$ ../rapid_s3_upload.sh s3://some-bucket-name/test_josephl_upload
Using [/usr/bin/zstd -1 -c] to compress files and [/usr/local/bin/aws s3 cp -] to upload them
Uploading
randwrite.7.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.7.0
randwrite.6.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.6.0
randwrite.5.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.5.0
randwrite.4.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.4.0
randwrite.3.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.3.0
randwrite.2.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.2.0
foo/randwrite.0.0 -> s3://some-bucket-name/test_josephl_upload/foo/randwrite.0.0
foo/randwrite.1.0 -> s3://some-bucket-name/test_josephl_upload/foo/randwrite.1.0
Waiting 5s for you to cancel if you don't want this
foo/randwrite.1.0 : 16.09% (1073741824 => 172779474 bytes, /*stdout*\)
foo/randwrite.0.0 : 16.08% (1073741824 => 172607631 bytes, /*stdout*\)
randwrite.2.0 : 16.06% (1073741824 => 172391826 bytes, /*stdout*\)
randwrite.5.0 : 16.05% (1073741824 => 172364194 bytes, /*stdout*\)
randwrite.6.0 : 15.96% (1073741824 => 171324403 bytes, /*stdout*\)
randwrite.7.0 : 16.05% (1073741824 => 172388299 bytes, /*stdout*\)
randwrite.3.0 : 16.13% (1073741824 => 173244139 bytes, /*stdout*\)
randwrite.4.0 : 16.08% (1073741824 => 172652481 bytes, /*stdout*\)
real 0m15.059s
user 1m43.584s
sys 0m8.990s
$ COMPRESS="$(which lz4)" ../rapid_s3_upload.sh s3://some-bucket-name/test_josephl_upload
Using [/usr/bin/lz4 -c] to compress files and [/usr/local/bin/aws s3 cp -] to upload them
Uploading
randwrite.7.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.7.0
randwrite.6.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.6.0
randwrite.5.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.5.0
randwrite.4.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.4.0
randwrite.3.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.3.0
randwrite.2.0 -> s3://some-bucket-name/test_josephl_upload/randwrite.2.0
foo/randwrite.0.0 -> s3://some-bucket-name/test_josephl_upload/foo/randwrite.0.0
foo/randwrite.1.0 -> s3://some-bucket-name/test_josephl_upload/foo/randwrite.1.0
Waiting 5s for you to cancel if you don't want this
real 0m36.217s
user 1m8.704s
sys 0m22.052s
#!/bin/bash
if [ -z "${COMPRESS}" ]; then
    COMPRESS=$(which zstd)
    if [ -z "${COMPRESS}" ]; then
        COMPRESS=$(which lz4)
    else
        # zstd is reasonably slow, so make it as fast as we can
        COMPRESS="${COMPRESS} -1"
    fi
fi

UPLOAD="$(which aws) s3 cp -"
BUCKET_PATH=$1

if [ -z "${COMPRESS}" ]; then
    echo "zstd or lz4 is required for maximum upload speeds, please install liblz4-tool"
    exit 1
else
    # lz4 default level is pretty reasonable
    COMPRESS="${COMPRESS} -c"
fi

echo "Using [${COMPRESS}] to compress files and [${UPLOAD}] to upload them"

if [ -z "${BUCKET_PATH}" ]; then
    echo "usage: rapid_s3_upload.sh s3://<bucket>/<path>..."
    exit 1
fi

echo "Uploading"
# Preview every file -> destination mapping before doing any work
find . -type f -printf '%P\n' | xargs -IX bash -c "echo 'X -> ${BUCKET_PATH}/X'"
echo "Waiting 5s for you to cancel if you don't want this"
sleep 5

# Compress and stream each file to S3, one pipeline per available CPU
time find . -type f -printf '%P\n' | xargs -P $(getconf _NPROCESSORS_ONLN) -IX bash -c "${COMPRESS} X | ${UPLOAD} ${BUCKET_PATH}/X"
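# For illustration: each path that find prints becomes its own compress-and-upload pipeline.
# With the defaults above, one file from the demo run is handled roughly like this
# (bucket and key names are just the ones from the example output):
#   /usr/bin/zstd -1 -c randwrite.2.0 | /usr/local/bin/aws s3 cp - s3://some-bucket-name/test_josephl_upload/randwrite.2.0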

Upload files to S3 as fast as possible

Often I hear people claim that S3 is slow, when in fact S3 is very fast and it is the user-space program you are using that is slow!

This gist provides a tool, rapid_s3_upload.sh, which uploads the directory you run it in to the provided S3 path as fast as humanly possible. This tool will saturate your NICs, saturate your CPUs, and upload data to S3 faster than basically anything else.

rapid_s3_upload.sh s3://some-path/in-s3

Furthermore, if you want to use a custom compressor you can run it with the COMPRESS parameter.

COMPRESS="$(which lz4) rapid_s3_upload.sh s3://some-path/in-s3

Under the hood

Under the hood this tool uses xargs to get maximum CPU parallelism, find to locate all files in the current working directory, and either zstd or lz4 to do the compression. Generally speaking the default zstd compressor is the right choice, but lz4 is a better fit when you have less compressible data and want to use less CPU. Currently the script doesn't support turning off compression, but it's pretty straightforward to refactor as you wish; one possible approach is sketched below.
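As a minimal sketch of that refactor (not something the script does today), you could reuse the same find/xargs pattern and let aws s3 cp read each file directly instead of piping it through a compressor:

BUCKET_PATH=s3://some-path/in-s3
# one aws upload per file, one worker per CPU, no compression step
find . -type f -printf '%P\n' | xargs -P $(getconf _NPROCESSORS_ONLN) -IX bash -c "aws s3 cp X ${BUCKET_PATH}/X"

The trade-off is that you keep the parallelism but lose the bandwidth savings from compression, so this mostly makes sense for data that is already compressed.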
