Last active
June 21, 2024 13:39
-
-
Save mh0w/4985ae266eec9288eebb962473c5bf06 to your computer and use it in GitHub Desktop.
BOTO3 basics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################################
# Reading and writing xlsx files Sparklessly from S3 with BOTO3 #
#################################################################
# The sections below also cover csv and parquet. Every read/write goes
# through the S3 response stream or an in-memory io.BytesIO buffer.
import io

import boto3
import pandas as pd
import raz_client

# Placeholder bucket name and object keys - replace with real values.
my_bucket = "bucket_name_goes_here"
# Each key's extension matches the pandas reader that consumes it
# (input_csv -> pd.read_csv, input_xlsx -> pd.read_excel).
input_csv = "folder/path/goes/here/animal_rescue.csv"
input_xlsx = "folder/path/goes/here/animal_rescue.xlsx"
input_parquet = "folder/path/goes/here/animal_rescue.parquet"
# Key stem for outputs; each write section appends its own extension.
output_path = "folder/path/goes/here/test"
my_ssl_path = "/folder/path/goes/here/certs/ca-bundle.crt"

client = boto3.client("s3")  # vs boto3.resource("s3")
# Hand the client to raz_client, passing the CA bundle path as ssl_file.
raz_client.configure_ranger_raz(client, ssl_file=my_ssl_path)
############
# Read csv #
############
# The response "Body" is used as a context manager and handed straight to
# pandas, so no intermediate buffer is built for the csv case.
csv_response = client.get_object(Bucket=my_bucket, Key=input_csv)
with csv_response["Body"] as csv_stream:
    my_df = pd.read_csv(csv_stream)
# Preview the first rows (the value is discarded unless run interactively).
my_df.head()
#############
# Read xlsx #
#############
# The whole object is read into memory first and wrapped in a BytesIO,
# which is what pd.read_excel is given to parse.
xlsx_payload = client.get_object(Bucket=my_bucket, Key=input_xlsx)["Body"].read()
with io.BytesIO(xlsx_payload) as xlsx_buffer:
    my_df_2 = pd.read_excel(xlsx_buffer)
# Preview the first rows (the value is discarded unless run interactively).
my_df_2.head()
################
# Read parquet #
################
# Same approach as the xlsx read: pull the full object into memory, then
# let pandas parse it from a BytesIO buffer.
parquet_payload = client.get_object(Bucket=my_bucket, Key=input_parquet)["Body"].read()
with io.BytesIO(parquet_payload) as parquet_buffer:
    my_df_3 = pd.read_parquet(parquet_buffer)
# Preview the first rows (the value is discarded unless run interactively).
my_df_3.head()
#############
# Write csv #
#############
# Serialise the DataFrame into an in-memory buffer, then upload the bytes.
# NOTE: writing csv to a binary buffer needs pandas >= 1.2.
with io.BytesIO() as output:
    my_df.to_csv(output)
    # Upload from the same buffer that was just written to; getvalue()
    # must be called before the `with` block closes the buffer.
    client.put_object(Bucket=my_bucket, Key=output_path + ".csv", Body=output.getvalue())
##############
# Write xlsx #
##############
# Build the workbook in memory (the DataFrame becomes a sheet), then upload
# the resulting bytes as an .xlsx object.
with io.BytesIO() as xlsx_out:
    # The upload happens only after the ExcelWriter context has exited,
    # while the BytesIO buffer is still open.
    with pd.ExcelWriter(xlsx_out, engine="openpyxl") as writer:
        my_df.to_excel(writer)
    xlsx_bytes = xlsx_out.getvalue()
    client.put_object(Bucket=my_bucket, Key=f"{output_path}.xlsx", Body=xlsx_bytes)
#################
# Write parquet #
#################
# Serialise the DataFrame to parquet in memory, then upload the bytes.
with io.BytesIO() as parquet_out:
    my_df.to_parquet(parquet_out)
    parquet_bytes = parquet_out.getvalue()  # read before the buffer is closed
    client.put_object(Bucket=my_bucket, Key=f"{output_path}.parquet", Body=parquet_bytes)
# List the objects whose keys start with output_path (the output "folder").
# .get("Contents") is used rather than indexing, so a missing key yields
# None instead of raising.
listing = client.list_objects(Bucket=my_bucket, Prefix=output_path)
listing.get("Contents")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment