# mh0w/BOTO3 basics.py

## BOTO3 basics.py
#################################################################
# Reading and writing xlsx files Sparklessly from S3 with BOTO3 #
#################################################################

import boto3
import raz_client
import pandas as pd
import io

# Placeholder locations - replace with a real bucket, object keys and CA bundle.
my_bucket = "bucket_name_goes_here"
# Each input key carries the extension of the format its reader expects.
input_csv = "folder/path/goes/here/animal_rescue.csv"
input_xlsx = "folder/path/goes/here/animal_rescue.xlsx"
input_parquet = "folder/path/goes/here/animal_rescue.parquet"
# Key prefix for the written files; each write section appends its own extension.
output_path = "folder/path/goes/here/test"
my_ssl_path = "/folder/path/goes/here/certs/ca-bundle.crt"

# Low-level S3 client (boto3.resource("s3") would be the alternative interface).
client = boto3.client("s3")
# NOTE(review): presumably enables Ranger RAZ authorisation on the client using
# the CA bundle above - confirm against the raz_client documentation.
raz_client.configure_ranger_raz(client, ssl_file=my_ssl_path)


############
# Read csv #
############

# Request the object, then let pandas parse the response body stream directly.
csv_response = client.get_object(Bucket=my_bucket, Key=input_csv)
with csv_response["Body"] as csv_stream:
    my_df = pd.read_csv(csv_stream)

# Preview the first rows (the value is only displayed in an interactive session).
my_df.head()


#############
# Read xlsx #
#############

# Download the whole object into memory and hand pandas an in-memory buffer.
xlsx_bytes = client.get_object(Bucket=my_bucket, Key=input_xlsx)["Body"].read()
with io.BytesIO(xlsx_bytes) as xlsx_buffer:
    my_df_2 = pd.read_excel(xlsx_buffer)

# Preview the first rows (the value is only displayed in an interactive session).
my_df_2.head()


################
# Read parquet #
################

# Download the whole object into memory and hand pandas an in-memory buffer.
parquet_bytes = client.get_object(Bucket=my_bucket, Key=input_parquet)["Body"].read()
with io.BytesIO(parquet_bytes) as parquet_buffer:
    my_df_3 = pd.read_parquet(parquet_buffer)

# Preview the first rows (the value is only displayed in an interactive session).
my_df_3.head()


#############
# Write csv #
#############

# Serialise the DataFrame into an in-memory buffer, then upload those bytes.
# NOTE: writing CSV to a binary buffer relies on pandas >= 1.2.
with io.BytesIO() as output:
    my_df.to_csv(output)
    # Upload from the same buffer that to_csv wrote into, while it is still open.
    client.put_object(Bucket=my_bucket, Key=output_path + ".csv", Body=output.getvalue())


##############
# Write xlsx #
##############

# Build the workbook in an in-memory buffer with the DataFrame as a sheet,
# then upload the buffer contents as an .xlsx object.
with io.BytesIO() as xlsx_buffer:
    # Upload only after the writer's with-block has exited.
    with pd.ExcelWriter(xlsx_buffer, engine="openpyxl") as xlsx_writer:
        my_df.to_excel(xlsx_writer)
    xlsx_key = output_path + ".xlsx"
    client.put_object(Bucket=my_bucket, Key=xlsx_key, Body=xlsx_buffer.getvalue())


#################
# Write parquet #
#################

# Serialise the DataFrame to parquet in memory, then upload the resulting bytes.
with io.BytesIO() as parquet_buffer:
    my_df.to_parquet(parquet_buffer)
    parquet_key = output_path + ".parquet"
    client.put_object(Bucket=my_bucket, Key=parquet_key, Body=parquet_buffer.getvalue())


# Show the objects stored under the output prefix; .get() yields None when the
# response has no "Contents" key.
listing = client.list_objects(Bucket=my_bucket, Prefix=output_path)
listing.get("Contents")
	#################################################################
	# Reading and writing xlsx files Sparklessly from S3 with BOTO3 #
	#################################################################

	import boto3
	import raz_client
	import pandas as pd
	import io

	# Placeholder locations - replace with a real bucket, object keys and CA bundle.
	my_bucket = "bucket_name_goes_here"
	# Each input key carries the extension of the format its reader expects.
	input_csv = "folder/path/goes/here/animal_rescue.csv"
	input_xlsx = "folder/path/goes/here/animal_rescue.xlsx"
	input_parquet = "folder/path/goes/here/animal_rescue.parquet"
	# Key prefix for the written files; each write section appends its own extension.
	output_path = "folder/path/goes/here/test"
	my_ssl_path = "/folder/path/goes/here/certs/ca-bundle.crt"

	# Low-level S3 client (boto3.resource("s3") would be the alternative interface).
	client = boto3.client("s3")
	# NOTE(review): presumably enables Ranger RAZ authorisation on the client using
	# the CA bundle above - confirm against the raz_client documentation.
	raz_client.configure_ranger_raz(client, ssl_file=my_ssl_path)


	############
	# Read csv #
	############

	# Let pandas parse the response body stream directly; the read must sit
	# inside the with-block that manages the stream.
	with client.get_object(Bucket=my_bucket, Key=input_csv)["Body"] as f:
		my_df = pd.read_csv(f)

	# Preview the first rows (the value is only displayed in an interactive session).
	my_df.head()


	#############
	# Read xlsx #
	#############

	# Download the whole object into an in-memory buffer and parse it while the
	# buffer is still open.
	with io.BytesIO(client.get_object(Bucket=my_bucket, Key=input_xlsx)["Body"].read()) as f:
		my_df_2 = pd.read_excel(f)

	# Preview the first rows (the value is only displayed in an interactive session).
	my_df_2.head()


	################
	# Read parquet #
	################

	# Download the whole object into an in-memory buffer and parse it while the
	# buffer is still open.
	with io.BytesIO(client.get_object(Bucket=my_bucket, Key=input_parquet)["Body"].read()) as f:
		my_df_3 = pd.read_parquet(f)

	# Preview the first rows (the value is only displayed in an interactive session).
	my_df_3.head()


	#############
	# Write csv #
	#############

	# Serialise the DataFrame into an in-memory buffer, then upload those bytes.
	# NOTE: writing CSV to a binary buffer relies on pandas >= 1.2.
	with io.BytesIO() as output:
		my_df.to_csv(output)
		# Upload from the same buffer that to_csv wrote into, while it is still open.
		client.put_object(Bucket=my_bucket, Key=output_path + ".csv", Body=output.getvalue())


	##############
	# Write xlsx #
	##############

	# Create workbook object, add df as a sheet, and write out to an .xlsx file
	with io.BytesIO() as output:
		# Upload only after the writer's with-block has exited, but while the
		# buffer itself is still open.
		with pd.ExcelWriter(output, engine='openpyxl') as writer:
			my_df.to_excel(writer)
		client.put_object(Bucket=my_bucket, Key=output_path + ".xlsx", Body=output.getvalue())


	#################
	# Write parquet #
	#################

	# Serialise the DataFrame to parquet in memory, then upload the bytes while
	# the buffer is still open.
	with io.BytesIO() as output:
		my_df.to_parquet(output)
		client.put_object(Bucket=my_bucket, Key=output_path + ".parquet", Body=output.getvalue())


	# Show the objects stored under the output prefix; .get() yields None when the
	# response has no "Contents" key.
	listing = client.list_objects(Bucket=my_bucket, Prefix=output_path)
	listing.get("Contents")