Created
March 20, 2021 08:39
-
-
Save iamarchisha/615a3b7f13ed9a55e169e1fedda7c481 to your computer and use it in GitHub Desktop.
Copies files that belong to a category (identified by a keyword in their key) from one AWS S3 bucket to another.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess

import boto3
# Get all the keys in S3 bucket
def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    Args:
        bucket (str): Name of the S3 bucket.
        prefix (str or tuple): Only fetch keys that start with this prefix (optional).
        suffix (str or tuple): Only fetch keys that end with this suffix (optional).

    Yields:
        str: Each object key that matches both prefix and suffix.
    """
    s3 = boto3.client('s3')
    kwargs = {'Bucket': bucket}
    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix
    while True:
        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects.
        resp = s3.list_objects_v2(**kwargs)
        # S3 omits 'Contents' entirely when the listing is empty (empty
        # bucket, or a prefix that matches nothing) — default to an empty
        # list instead of raising KeyError.
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key
        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next request, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break
# Generate list of all files in bucket
def bucket_keys(bucket):
    """
    Return a list of every object key present in the bucket.

    Args:
        bucket (str): Name of the S3 bucket.
    """
    keys_name = list(get_matching_s3_keys(bucket=bucket))
    # check number of keys in the bucket
    print("There are {0} files in the bucket".format(len(keys_name)))
    return keys_name
# Find files with a string in the bucket
def find_key_with(bucket, name, keys_name=None):
    """
    Generate a list of full S3 paths for keys that contain the given keyword.

    Note: this is a substring match (``name in key``), not a prefix match,
    despite the original docstring's wording.

    Args:
        bucket (str): Prefix prepended to each matching key, typically
            "s3://<bucket-name>/". Also used to derive the bucket name
            for listing when ``keys_name`` is not supplied.
        name (str): The keyword for which files in bucket will be searched.
        keys_name (list, optional): Object keys to filter. When omitted,
            keys are fetched with bucket_keys(). (The original code read
            an undefined global ``keys_name``, raising NameError.)
    """
    if keys_name is None:
        # Strip an "s3://" scheme and trailing slash so the listing API
        # receives a bare bucket name — TODO confirm against callers.
        bucket_name = bucket[len("s3://"):] if bucket.startswith("s3://") else bucket
        keys_name = bucket_keys(bucket_name.strip("/"))
    keys_with_name = []
    for key in keys_name:
        if name in key:
            keys_with_name.append(bucket + key)
    return keys_with_name
# List all the unique categories/folder names in bucket
def unique_category(key_list):
    """
    Find all the unique categories from the list generated by find_key_with().

    The list must contain paths of the format:
        's3://<bucket-name>/<folder-name>/<file-name>'
    Returns the unique folder names, in first-seen order.

    Args:
        key_list (list): The list of file paths in S3.
    """
    folders = []
    for path in key_list:
        # Folder name is the segment between the 3rd and 4th "/"
        # of the s3:// path.
        cuts = [pos for pos, ch in enumerate(path) if ch == "/"]
        folders.append(path[cuts[2] + 1:cuts[3]])
    # dict.fromkeys deduplicates while preserving first-seen order.
    unique_keys = list(dict.fromkeys(folders))
    print("{0} unique folder names were found having a total of {1} files".format(len(unique_keys), len(key_list)))
    return unique_keys
# List of files belonging to a category/folder in S3 bucket
def files_list(bucket, key_category):
    """
    Generate a list of files under a specific folder.

    Args:
        bucket (str): Name of the S3 bucket in which the folder is present.
        key_category (str): Name of the folder.
    """
    matches = find_key_with(bucket, key_category)
    print("{} files under {} were found".format(len(matches), key_category))
    return matches
# Copy files to destination S3 bucket.
def move_files(source_list, destination):
    """
    Copy the files in source list to the destination.

    Args:
        source_list (list): List of all the files with full path that need to be copied.
        destination (str): The path to which the files need to be copied.
            "s3://<bucket-name>/<destination-folder-name>/"
    """
    for src in source_list:
        # Pass an argument list with shell=False so paths containing
        # spaces, quotes or shell metacharacters cannot break (or inject
        # into) the command line, unlike the original os.system() string.
        # check=False preserves the original best-effort behaviour of
        # ignoring a non-zero exit code for an individual copy.
        subprocess.run(["aws", "s3", "cp", src, destination], check=False)
    print("Files copied!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment