Skip to content

Instantly share code, notes, and snippets.

@mh0w
Last active June 21, 2024 13:39
Show Gist options
  • Save mh0w/4985ae266eec9288eebb962473c5bf06 to your computer and use it in GitHub Desktop.
Save mh0w/4985ae266eec9288eebb962473c5bf06 to your computer and use it in GitHub Desktop.
BOTO3 basics
####################################################################
# Reading and writing csv/xlsx/parquet on S3 with boto3 (no Spark) #
####################################################################
import boto3
import raz_client
import pandas as pd
import io
# Bucket and object keys used by the examples below (placeholders to edit).
my_bucket = "bucket_name_goes_here"
# Each key's extension matches the reader it is passed to (csv -> read_csv,
# xlsx -> read_excel, parquet -> read_parquet).
input_csv = "folder/path/goes/here/animal_rescue.csv"
input_xlsx = "folder/path/goes/here/animal_rescue.xlsx"
input_parquet = "folder/path/goes/here/animal_rescue.parquet"
# Output key without extension; each writer appends its own suffix.
output_path = "folder/path/goes/here/test"
my_ssl_path = "/folder/path/goes/here/certs/ca-bundle.crt"

client = boto3.client("s3")  # vs boto3.resource("s3")
# Configure the client through raz_client, passing the CA bundle path.
raz_client.configure_ranger_raz(client, ssl_file=my_ssl_path)
############
# Read csv #
############
# The "Body" entry of the get_object response is used as a context manager
# and handed to pandas as a file-like object.
with client.get_object(Bucket=my_bucket, Key=input_csv)["Body"] as f:
    my_df = pd.read_csv(f)
my_df.head()
#############
# Read xlsx #
#############
# The whole object body is read into memory and wrapped in a BytesIO
# (presumably because the Excel reader needs a seekable buffer).
with io.BytesIO(client.get_object(Bucket=my_bucket, Key=input_xlsx)["Body"].read()) as f:
    my_df_2 = pd.read_excel(f)
my_df_2.head()
################
# Read parquet #
################
# The whole object body is read into memory and wrapped in a BytesIO
# before being passed to pandas.
with io.BytesIO(client.get_object(Bucket=my_bucket, Key=input_parquet)["Body"].read()) as f:
    my_df_3 = pd.read_parquet(f)
my_df_3.head()
#############
# Write csv #
#############
# Serialise the dataframe into an in-memory buffer, then upload its bytes.
# The upload stays inside the with-block: getvalue() raises ValueError once
# the buffer has been closed.
with io.BytesIO() as output:
    my_df.to_csv(output)
    client.put_object(Bucket=my_bucket, Key=output_path + ".csv", Body=output.getvalue())
##############
# Write xlsx #
##############
# Create workbook object, add df as a sheet, and write out to an .xlsx file.
with io.BytesIO() as output:
    # Leave the ExcelWriter block before reading the buffer, so the upload
    # only happens once the writer has been closed.
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        my_df.to_excel(writer)
    # Upload while the buffer is still open; getvalue() fails after close.
    client.put_object(Bucket=my_bucket, Key=output_path + ".xlsx", Body=output.getvalue())
#################
# Write parquet #
#################
# Serialise the dataframe to parquet in memory, then upload the bytes while
# the buffer is still open (getvalue() fails on a closed BytesIO).
with io.BytesIO() as output:
    my_df.to_parquet(output)
    client.put_object(Bucket=my_bucket, Key=output_path + ".parquet", Body=output.getvalue())
# Show what is stored under the output prefix; .get() yields None when the
# response carries no "Contents" entry.
listing = client.list_objects(Bucket=my_bucket, Prefix=output_path)
listing.get("Contents")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment