Skip to content

Instantly share code, notes, and snippets.

@ryandhubbard
Created October 21, 2023 22:04
Show Gist options
  • Save ryandhubbard/e72e764168a30d4e9ce8aadc580f470b to your computer and use it in GitHub Desktop.
Save ryandhubbard/e72e764168a30d4e9ce8aadc580f470b to your computer and use it in GitHub Desktop.
Clean up an S3 bucket folder: merge its CSV files and re-write the data as one file per date.
import pandas as pd
import boto3
from io import StringIO
import datetime
import warnings
#suppress warnings
warnings.filterwarnings("ignore")
def main():
    """Merge all CSV files under an S3 prefix, drop two columns, and
    re-write the data as one CSV per date to a destination S3 path.

    Reads:  s3://<bucket_name>/<file_key>*.csv
    Writes: <destination_path><YYYY-MM-DD>.csv (one file per "Date" value)

    Notes for configuration: `bucket_name`, `file_key` and
    `destination_path` are placeholders and must be filled in.
    """
    # Destination prefix for the per-date output files (e.g. "s3://my-bucket/out/").
    destination_path = "s3://"
    bucket_name = ''
    file_key = 'folder/'

    s3_client = boto3.client('s3')

    # Use the paginator as the single source of keys: a bare
    # list_objects_v2 call is capped at 1000 keys, so iterating its
    # result directly (as the original did) silently drops objects.
    paginator = s3_client.get_paginator('list_objects_v2')

    frames = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=file_key):
        # A page with no matches has no 'Contents' key — .get avoids a KeyError.
        for content in page.get('Contents', []):
            key = content['Key']
            # Only merge CSV objects; skip folder markers and other file types.
            if key.endswith('.csv'):
                obj = s3_client.get_object(Bucket=bucket_name, Key=key)
                frames.append(pd.read_csv(obj['Body']))

    if not frames:
        print("No objects found in the specified S3 bucket and prefix.")
        return

    # Concatenate once at the end: DataFrame.append was removed in
    # pandas 2.0 and appending in a loop is quadratic anyway.
    df = pd.concat(frames, ignore_index=True)

    # Drop ad text column and AG Placement (placeholder column names).
    df = df.drop(['column', 'column2'], axis=1)

    # Write each date's rows to its own CSV in the destination prefix.
    # Assumes "Date" values are "%Y-%m-%d" strings — strptime both
    # validates the format and normalizes the output filename.
    for date_str, group in df.groupby("Date"):
        date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        formatted_date = date.strftime("%Y-%m-%d")
        output_path = f"{destination_path}{formatted_date}.csv"
        group.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment