Created
October 21, 2023 22:04
-
-
Save ryandhubbard/e72e764168a30d4e9ce8aadc580f470b to your computer and use it in GitHub Desktop.
Clean up an S3 bucket folder and copy its data into per-date files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import boto3 | |
from io import StringIO | |
import datetime | |
import warnings | |
#suppress warnings | |
warnings.filterwarnings("ignore") | |
def main():
    """Combine every CSV under an S3 prefix and rewrite the data as one CSV per date.

    Reads all ``.csv`` objects below ``file_key`` in ``bucket_name``, drops two
    columns, groups rows by the ``Date`` column, and writes each group to
    ``{destination_path}{YYYY-MM-DD}.csv``.

    NOTE: writing directly to an ``s3://`` path with ``to_csv`` requires the
    ``s3fs`` package to be installed — TODO confirm it is available at runtime.
    """
    # TODO: fill in the real destination, bucket, and prefix before running.
    destination_path = "s3://"
    bucket_name = ''
    file_key = 'folder/'

    s3_client = boto3.client('s3')

    # Paginate so prefixes holding more than 1000 objects are fully covered —
    # a single list_objects_v2 call returns at most 1000 keys. The original
    # code built the paginator but then only ever processed the first page,
    # and re-processed it once per listed object.
    paginator = s3_client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=file_key)

    frames = []
    for page in page_iterator:
        # A page for an empty prefix carries no 'Contents' key.
        for content in page.get('Contents', []):
            key = content['Key']
            # Only CSV files belong to this dataset; skip everything else.
            if key.endswith('.csv'):
                obj = s3_client.get_object(Bucket=bucket_name, Key=key)
                frames.append(pd.read_csv(obj['Body']))

    if not frames:
        print("No objects found in the specified S3 bucket and prefix.")
        return

    # DataFrame.append was removed in pandas 2.x; a single concat is also
    # O(n) instead of the quadratic repeated-append pattern.
    df = pd.concat(frames, ignore_index=True)

    # Drop ad text column and AG Placement.
    df = df.drop(['column', 'column2'], axis=1)

    # Write each date's rows to its own file in the destination prefix.
    for date_str, group in df.groupby("Date"):
        # Round-trip through strptime/strftime to validate that the value
        # really is a YYYY-MM-DD date before using it in the object key.
        date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        formatted_date = date.strftime("%Y-%m-%d")
        output_path = f"{destination_path}{formatted_date}.csv"
        group.to_csv(output_path, index=False)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment