@iamarchisha
Created March 20, 2021 08:39
Copy files belonging to a category (folder) matching a given keyword from one AWS S3 bucket to another.
import os
import boto3
# Get all the keys in an S3 bucket
def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    Args:
        bucket (str): Name of the S3 bucket.
        prefix (str): Only fetch keys that start with this prefix (optional).
        suffix (str): Only fetch keys that end with this suffix (optional).
    """
    s3 = boto3.client('s3')
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects
        # (it is missing entirely when there are no matching objects).
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next request, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break
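# A minimal usage sketch (the bucket name and suffix below are hypothetical,
# not part of the original gist): stream only the ".jpg" keys from a bucket
# without holding the full listing in memory.
#
#     for key in get_matching_s3_keys("my-source-bucket", suffix=".jpg"):
#         print(key)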
# Generate list of all files in bucket
def bucket_keys(bucket):
    """
    Generates a list containing all the files present in the bucket.

    Args:
        bucket (str): Name of the S3 bucket.
    """
    keys_count = 0
    keys_name = []
    for key in get_matching_s3_keys(bucket=bucket):
        keys_count = keys_count + 1
        keys_name.append(key)
    # check number of keys in the bucket
    print("There are {0} files in the bucket".format(keys_count))
    return keys_name
# Find files containing a keyword in the bucket
def find_key_with(bucket, name):
    """
    Generates a list of full S3 paths of the files in the bucket whose keys
    contain the given keyword.

    Args:
        bucket (str): Name of the S3 bucket in which files are to be searched.
        name (str): The keyword for which files in the bucket will be searched.
    """
    keys_with_name = []
    for key in get_matching_s3_keys(bucket=bucket):
        if name in key:
            keys_with_name.append("s3://{0}/{1}".format(bucket, key))
    return keys_with_name
# List all the unique categories/folder names in bucket
def unique_category(key_list):
    """
    Find all the unique categories from the list generated by find_key_with().
    The list must contain paths of the format:
        's3://<bucket-name>/<folder-name>/<file-name>'
    This function will return a list of unique folder names.

    Args:
        key_list (list): The list of file paths in S3.
    """
    unique_keys = []
    for key in key_list:
        # The folder name sits between the third and fourth "/" of the path.
        index = [i for i, ltr in enumerate(key) if ltr == "/"]
        unique_key = key[index[2]+1:index[3]]
        if unique_key not in unique_keys:
            unique_keys.append(unique_key)
    print("{0} unique folder names were found having a total of {1} files".format(len(unique_keys), len(key_list)))
    return unique_keys
# List of files belonging to a category/folder in S3 bucket
def files_list(bucket, key_category):
    """
    Generate a list of files under a specific folder.

    Args:
        bucket (str): Name of the S3 bucket in which the folder is present.
        key_category (str): Name of the folder.
    """
    files = find_key_with(bucket, key_category)
    print("{} files under {} were found".format(len(files), key_category))
    return files
# Copy files to destination S3 bucket.
def move_files(source_list, destination):
    """
    Copy the files in source_list to the destination.

    Args:
        source_list (list): List of all the files (full S3 paths) that need to be copied.
        destination (str) : The path to which the files need to be copied, e.g.
            "s3://<bucket-name>/<destination-folder-name>/"
    """
    for i in source_list:
        os.system("aws s3 cp '{0}' '{1}'".format(i, destination))
    print("Files copied!")