Useful snippets from the docs with some commentary in the comments
import awswrangler as wr
import pandas as pd
# Get the current IAM role/user
name = wr.sts.get_current_identity_name()
arn = wr.sts.get_current_identity_arn()
# Reading files
df = wr.s3.read_csv("s3://sample-bucket/sample.csv") # you can optionally select a subset of columns with usecols=['col_name1'] and parse date cols with: parse_dates=["col_name2"]
df = wr.s3.read_json("s3://sample-bucket/sample.json")
df = wr.s3.read_parquet("s3://sample-bucket/sample.parquet") # you can read individual parquet files or every parquet file in a subfolder (dataset example below)
df = wr.athena.read_sql_query("""SELECT * FROM table_name limit 100""", database="test_database")
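# Reading a whole folder (prefix) of Parquet files as one dataset. A minimal sketch
# (the prefix is a placeholder); with dataset=True, Hive-style partition columns
# encoded in the path come back as regular dataframe columns.
df = wr.s3.read_parquet("s3://sample-bucket/sample_dataset/", dataset=True)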
# Writing files
wr.s3.to_csv(df, "s3://sample-bucket/sample_2.csv", index=False) # note there's a specific CSV<>Athena function below
wr.s3.to_json(df, "s3://sample-bucket/sample_2.json")
wr.s3.to_parquet(df, "s3://sample-bucket/sample_2.parquet")
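# You can also write a multi-file dataset to S3 and register it in the Glue catalog in
# the same call, making it queryable from Athena right away. A minimal sketch; the
# bucket, database, table, and partition column names below are placeholders.
wr.s3.to_parquet(
    df=df,
    path="s3://sample-bucket/sample_dataset/",
    dataset=True,                 # treat the path as a dataset (folder of files)
    database="sample_db",         # Glue database to register the table in
    table="sample_table",
    partition_cols=["col_name2"], # optional Hive-style partitioning
    mode="overwrite"              # or "append" / "overwrite_partitions"
)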
# Retrieve databases in Glue catalog
wr.catalog.databases()
# Retrieve tables in a Glue database
wr.catalog.tables(database="sample_db")
# Search a table in Glue database
wr.catalog.tables(name_contains="sample")
wr.catalog.tables(name_prefix="sample_")
wr.catalog.tables(name_suffix="_table")
wr.catalog.tables(search_text="table description")
# Retrieve table details
wr.catalog.table(database="sample_db", table="sample_table")
# Delete a table
wr.catalog.delete_table_if_exists(database="sample_db", table="sample_table")
# Delete a database
wr.catalog.delete_database('sample_db')
# Get SQLAlchemy Engine from a Glue Catalog Connection
wr.catalog.get_engine('sample_connection')
# Get SQLAlchemy Engine using the given db parameters
postgres_engine = wr.db.get_engine(
    db_type="postgresql",
    host="127.0.0.1",
    port=5432,
    database="sample_db",
    user="sample_user",
    password="sample_password"
)
# Reading data from table in database
wr.db.read_sql_query("SELECT * FROM public.sample_table", con=postgres_engine)
# Writing data into table in database
wr.db.to_sql(df, postgres_engine, schema="public", name="sample_table", if_exists="replace", index=False)
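# The Redshift snippets below use a few variables that aren't defined in this excerpt
# (path, iam_role, redshift_engine). A minimal setup sketch; every value is a placeholder.
path = "s3://sample-bucket/redshift_stage/"  # S3 staging prefix used by COPY/UNLOAD
iam_role = "arn:aws:iam::123456789012:role/sample-redshift-role"  # role Redshift assumes to read/write that S3 path
redshift_engine = wr.catalog.get_engine("sample-redshift-connection")  # Glue Catalog connection pointing at the cluster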
# Load data into redshift using COPY command
wr.db.copy_to_redshift(
    df=df,
    path=path,
    con=redshift_engine,
    schema="public",
    table="sample_table",
    mode="overwrite",
    iam_role=iam_role
)
# Unload data from redshift using UNLOAD command
wr.db.unload_redshift(
    sql="SELECT * FROM public.sample_table",
    con=redshift_engine,
    iam_role=iam_role,
    path=path,
    keep_files=True
)
# Read data from Athena using SQL query
wr.athena.read_sql_query("SELECT * FROM sample_table", database="sample_db")
# Read data from Athena using SQL query using specific chunksize
df_chunks = wr.athena.read_sql_query(
    "SELECT * FROM sample_table",
    database="sample_db",
    chunksize=10000,            # or True to let wrangler decide for you
    categories=["col1", "col2"] # can improve performance a lot
)
for df in df_chunks:
    print(len(df.index))
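# By default wrangler runs Athena reads with the CTAS approach (results staged as
# Parquet, which is faster for large result sets); it can be turned off if you lack the
# Glue/S3 permissions it needs. A minimal sketch with placeholder table/database names:
df = wr.athena.read_sql_query(
    "SELECT * FROM sample_table",
    database="sample_db",
    ctas_approach=False  # plain Athena query, results parsed from the regular CSV output
)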
# merging the underlying s3 paths of two datasets into one
wr.s3.merge_datasets(
    source_path=path2,
    target_path=path1,
    mode="append"
)
# use wr.catalog.extract_athena_types() to get partitions_types and columns_types
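# A minimal sketch of that call (the partition column name is a placeholder):
columns_types, partitions_types = wr.catalog.extract_athena_types(
    df=df,
    file_format="csv",           # match the format of the table created below
    index=False,
    partition_cols=["col_name2"]
)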
wr.catalog.create_csv_table(
    table="csv_crawler",
    database="awswrangler_test",
    path=path,
    partitions_types=partitions_types,
    columns_types=columns_types
)