Created
October 21, 2023 22:04
-
-
Save ryandhubbard/e72e764168a30d4e9ce8aadc580f470b to your computer and use it in GitHub Desktop.
Clean up an S3 bucket folder and copy its data into per-date files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import boto3 | |
from io import StringIO | |
import datetime | |
import warnings | |
#suppress warnings | |
warnings.filterwarnings("ignore") | |
def main():
    """Combine every CSV under an S3 prefix and rewrite the data as one CSV per date.

    Reads all ``.csv`` objects below ``file_key`` in ``bucket_name``, drops two
    columns, groups rows by the ``Date`` column, and writes each group to
    ``{destination_path}{YYYY-MM-DD}.csv``.

    NOTE: writing directly to an ``s3://`` path with ``to_csv`` requires the
    ``s3fs`` package to be installed — TODO confirm it is available at runtime.
    """
    # TODO: fill in the real destination, bucket, and prefix before running.
    destination_path = "s3://"
    bucket_name = ''
    file_key = 'folder/'

    s3_client = boto3.client('s3')

    # Paginate so prefixes holding more than 1000 objects are fully covered —
    # a single list_objects_v2 call returns at most 1000 keys. The original
    # code built the paginator but then only ever processed the first page,
    # and re-processed it once per listed object.
    paginator = s3_client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=file_key)

    frames = []
    for page in page_iterator:
        # A page for an empty prefix carries no 'Contents' key.
        for content in page.get('Contents', []):
            key = content['Key']
            # Only CSV files belong to this dataset; skip everything else.
            if key.endswith('.csv'):
                obj = s3_client.get_object(Bucket=bucket_name, Key=key)
                frames.append(pd.read_csv(obj['Body']))

    if not frames:
        print("No objects found in the specified S3 bucket and prefix.")
        return

    # DataFrame.append was removed in pandas 2.x; a single concat is also
    # O(n) instead of the quadratic repeated-append pattern.
    df = pd.concat(frames, ignore_index=True)

    # Drop ad text column and AG Placement.
    df = df.drop(['column', 'column2'], axis=1)

    # Write each date's rows to its own file in the destination prefix.
    for date_str, group in df.groupby("Date"):
        # Round-trip through strptime/strftime to validate that the value
        # really is a YYYY-MM-DD date before using it in the object key.
        date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        formatted_date = date.strftime("%Y-%m-%d")
        output_path = f"{destination_path}{formatted_date}.csv"
        group.to_csv(output_path, index=False)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment