Skip to content

Instantly share code, notes, and snippets.

@dlinsley
Last active September 28, 2023 17:18
Show Gist options
  • Save dlinsley/19632c38f660b236ba3f8656b76b2496 to your computer and use it in GitHub Desktop.
Save dlinsley/19632c38f660b236ba3f8656b76b2496 to your computer and use it in GitHub Desktop.
Find duplicate objects in an AWS S3 bucket by comparing ETags
#!/usr/bin/env python3
import boto3
import argparse
import string
def find_duplicates(objects):
    """Yield ``(first_key, duplicate_key)`` pairs for objects sharing an ETag.

    Args:
        objects: Iterable of mappings with ``'Key'``, ``'Size'`` and ``'ETag'``
            entries, i.e. the items of a ``list_objects_v2`` ``Contents`` list.

    Yields:
        A tuple of the first key seen with a given ETag and each later key
        that carries the same ETag, in listing order.

    Zero-byte objects are skipped: they would all compare equal to each other
    and only add noise to the report.

    NOTE(review): an equal ETag is treated as "same content". Per the S3
    documentation the ETag of multipart-uploaded or SSE-KMS/SSE-C encrypted
    objects is not a plain MD5 of the data, so identical content uploaded in
    different ways may go undetected -- confirm this is acceptable for the
    bucket being scanned.
    """
    first_key_by_etag = {}
    for obj in objects:
        if obj['Size'] <= 0:
            continue
        etag = obj['ETag']
        key = obj['Key']
        if etag in first_key_by_etag:
            yield first_key_by_etag[etag], key
        else:
            first_key_by_etag[etag] = key


def iter_bucket_objects(s3_client, bucket):
    """Yield every object entry in ``bucket`` across all listing pages.

    Args:
        s3_client: A boto3 S3 client (anything exposing ``get_paginator``).
        bucket: Name of the bucket to list.

    Yields:
        The object dictionaries found under each page's ``'Contents'`` entry.
    """
    # The paginator follows the continuation token for us, so we no longer
    # rely on "a page had exactly 1000 keys" to decide whether more remain.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket):
        # 'Contents' is absent from a response that holds no keys (for
        # example an empty bucket); treat that as an empty page rather than
        # crashing with KeyError.
        for obj in page.get('Contents', []):
            yield obj


def main():
    """Parse the command line and print every duplicate found in the bucket."""
    parser = argparse.ArgumentParser('Find duplicate objects in an aws s3 bucket')
    parser.add_argument('--bucket', dest='myBucket', default='yourBucketName', help='S3 Bucket to search')
    cli_args = parser.parse_args()

    s3 = boto3.client('s3')
    print('searching for duplicate objects')
    print('')
    bucket_objects = iter_bucket_objects(s3, cli_args.myBucket)
    for first_key, duplicate_key in find_duplicates(bucket_objects):
        print('!!Duplicate: - %s - %s' % (first_key, duplicate_key))
    print('... The End.')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment