```bash
#!/bin/bash
if [ $# -ne 2 ]; then
    echo "Usage: $0 file partSizeInMb"
    exit 1
fi
file=$1
if [ ! -f "$file" ]; then
    echo "Error: $file not found."
    exit 1
fi
partSizeInMb=$2
fileSizeInMb=$(du -m "$file" | cut -f 1)
# Round the part count up if the file is not an exact multiple of the part size.
parts=$((fileSizeInMb / partSizeInMb))
if [[ $((fileSizeInMb % partSizeInMb)) -gt 0 ]]; then
    parts=$((parts + 1))
fi
checksumFile=$(mktemp -t s3md5)
# MD5 each part and collect the hex digests in a scratch file.
for (( part=0; part<parts; part++ ))
do
    skip=$((partSizeInMb * part))
    dd bs=1m count="$partSizeInMb" skip="$skip" if="$file" 2>/dev/null | md5 >> "$checksumFile"
done
# The multipart ETag is the MD5 of the concatenated binary part digests,
# followed by a dash and the part count.
echo "$(xxd -r -p "$checksumFile" | md5)-$parts"
rm "$checksumFile"
```
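A hypothetical invocation, assuming the script is saved as `s3md5.sh` (the file name and 8 MB part size below are placeholders; the part size must match the one used for the original multipart upload):

```bash
# Hypothetical example: compute the expected S3 ETag for a local file
# that was uploaded in 8 MB parts. Script/file names are placeholders.
./s3md5.sh archive.tar.gz 8
# Prints <md5-of-concatenated-part-digests>-<partCount>, which should
# match the ETag S3 reports for the object.
```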
Thanks for this, really helped me!
Very cool. I have a patch to make it work for Linux, if that's of interest; I'll fork and PR if so.

One file I have doesn't match S3's MD5 sum, even after multiple downloads. The chunk size is rather big (512 MB). Not that a hash tells us much, but Amazon says it's:

Whereas your script says:

The file size is 609865657 bytes. A different algorithm for big files? That doesn't really make sense.
How do you know what part size was used/to use?
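If the part size is unknown, one hedged approach is to try common sizes until the computed value matches the ETag S3 reports. A rough sketch, assuming the script above is saved as `s3md5.sh`; the target ETag and file name are placeholders:

```bash
# Hypothetical sketch: probe common part sizes (in MB) until the
# computed ETag matches the one reported by S3.
target="<etag-reported-by-s3>"
for size in 5 8 15 16 64 128; do
    if [ "$(./s3md5.sh bigfile.bin "$size")" = "$target" ]; then
        echo "part size was ${size} MB"
        break
    fi
done
```

The part count suffixed to the ETag also narrows the search: the part size must be at least fileSize/partCount and less than fileSize/(partCount-1).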
@RichardBronosky
We should use this, if uploaded with
Thanks for this, it helped me validate a heap of files I had in S3. Note that AWS S3 supports a maximum of 10,000 parts; I recently exceeded this on a project with a 54 GB file and a 5 MB part size. The AWS SDK adjusts the part size to fit within 10,000 parts. If you happen to exceed 10,000 parts, I used this expression to get the right part size so the ETag calculates correctly (I also specified the part size in bytes for better accuracy):

`partsize = (filesize / 10000) + 1`
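A sketch of that calculation in shell (variable names are illustrative; `stat -c %s` is the GNU coreutils form, `stat -f %z` on BSD/OSX):

```bash
# Hypothetical sketch: compute a part size (in bytes) that keeps a
# multipart upload within S3's 10,000-part limit, per the expression above.
filesize=$(stat -c %s "$file")        # GNU stat; use `stat -f %z` on OSX
partsize=$(( filesize / 10000 + 1 ))
echo "minimum part size: ${partsize} bytes"
```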
Thank you. This is helpful. Are there any alternatives for `md5`?
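On Linux, `md5sum` from GNU coreutils is the usual stand-in, though its output format differs from OSX's `md5`, which the Linux variant below has to compensate for:

```bash
# OSX md5 can print just the digest; GNU md5sum appends a marker
# ("-" for stdin, or the file name), which must be stripped.
md5 -q somefile                      # OSX: bare digest
md5sum somefile | awk '{print $1}'   # Linux: bare digest
```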
Awesome script. It doesn't work for SSE-KMS files, so if you happen to uncover any intel on how AWS generates the MD5 for that scenario, please share. Again, awesome job here.
Thanks for the script, this is incredibly helpful and we're incorporating it into our workflows!
**Linux users**

Here is an equivalent script if you are not using OSX. I hope this helps!

```bash
#!/bin/bash
set -euo pipefail
if [ $# -ne 2 ]; then
    echo "Usage: $0 file partSizeInMb"
    exit 1
fi
file=$1
if [ ! -f "$file" ]; then
    echo "Error: $file not found."
    exit 1
fi
partSizeInMb=$2
fileSizeInMb=$(du -m "$file" | cut -f 1)
parts=$((fileSizeInMb / partSizeInMb))
if [[ $((fileSizeInMb % partSizeInMb)) -gt 0 ]]; then
    parts=$((parts + 1))
fi
checksumFile=$(mktemp -t s3md5.XXXXXXXXXXXXX)
for (( part=0; part<parts; part++ ))
do
    skip=$((partSizeInMb * part))
    # GNU dd takes an uppercase M suffix; md5sum replaces OSX's md5.
    dd bs=1M count="$partSizeInMb" skip="$skip" if="$file" 2> /dev/null | md5sum >> "$checksumFile"
done
# md5sum appends " -" for stdin input; the sed strips it out of the ETag.
etag=$(echo $(xxd -r -p "$checksumFile" | md5sum)-$parts | sed 's/ --/-/')
echo -e "${1}\t${etag}"
rm "$checksumFile"
```
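A hypothetical batch usage (the script name, files, and part size are placeholders), taking advantage of the tab-separated `file<TAB>etag` output:

```bash
# Hypothetical example: compute ETags for many files at once and
# collect them in a TSV for comparison against S3's listings.
for f in *.bin; do
    ./s3md5-linux.sh "$f" 8
done > etags.tsv
```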
Awesome script! Thank you.