Useful snippets from the docs with some commentary in the comments
import awswrangler as wr
import pandas as pd
# Get the current IAM role/user
name = wr.sts.get_current_identity_name()
arn = wr.sts.get_current_identity_arn()
# Reading files
df = wr.s3.read_csv("s3://sample-bucket/sample.csv") # you can optionally select a subset of columns with usecols=['col_name1'] and parse date cols with: parse_dates=["col_name2"]
df = wr.s3.read_json("s3://sample-bucket/sample.json")
df = wr.s3.read_parquet("s3://sample-bucket/sample.parquet") # you can read individual parquet files or every parquet file in a subfolder (dataset example below)
df = wr.athena.read_sql_query("""SELECT * FROM table_name limit 100""", database="test_database")
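# Reading a whole folder (prefix) of Parquet files as one dataset. A minimal sketch
# (the prefix is a placeholder); with dataset=True, Hive-style partition columns
# encoded in the path come back as regular dataframe columns.
df = wr.s3.read_parquet("s3://sample-bucket/sample_dataset/", dataset=True)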
# Writing files
wr.s3.to_csv(df, "s3://sample-bucket/sample_2.csv", index=False) # note there's a specific CSV<>Athena function below
wr.s3.to_json(df, "s3://sample-bucket/sample_2.json")
wr.s3.to_parquet(df, "s3://sample-bucket/sample_2.parquet")
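# You can also write a multi-file dataset to S3 and register it in the Glue catalog in
# the same call, making it queryable from Athena right away. A minimal sketch; the
# bucket, database, table, and partition column names below are placeholders.
wr.s3.to_parquet(
    df=df,
    path="s3://sample-bucket/sample_dataset/",
    dataset=True,                 # treat the path as a dataset (folder of files)
    database="sample_db",         # Glue database to register the table in
    table="sample_table",
    partition_cols=["col_name2"], # optional Hive-style partitioning
    mode="overwrite"              # or "append" / "overwrite_partitions"
)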
# Retrieve databases in Glue catalog
wr.catalog.databases()
# Retrieve tables in a Glue database
wr.catalog.tables(database="sample_db")
# Search a table in Glue database
wr.catalog.tables(name_contains="sample")
wr.catalog.tables(name_prefix="sample_")
wr.catalog.tables(name_suffix="_table")
wr.catalog.tables(search_text="table description")
# Retrieve table details
wr.catalog.table(database="sample_db", table="sample_table")
# Delete a table
wr.catalog.delete_table_if_exists(database="sample_db", table="sample_table")
# Delete a database
wr.catalog.delete_database('sample_db')
# Get SQLAlchemy Engine from a Glue Catalog Connection
wr.catalog.get_engine('sample_connection')
# Get SQLAlchemy Engine using the given db parameters
postgres_engine = wr.db.get_engine(
    db_type="postgresql",
    host="127.0.0.1",
    port=5432,
    database="sample_db",
    user="sample_user",
    password="sample_password"
)
# Reading data from table in database
wr.db.read_sql_query("SELECT * FROM public.sample_table", con=postgres_engine)
# Writing data into table in database
wr.db.to_sql(df, postgres_engine, schema="public", name="sample_table", if_exists="replace", index=False)
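# The Redshift snippets below use a few variables that aren't defined in this excerpt
# (path, iam_role, redshift_engine). A minimal setup sketch; every value is a placeholder.
path = "s3://sample-bucket/redshift_stage/"  # S3 staging prefix used by COPY/UNLOAD
iam_role = "arn:aws:iam::123456789012:role/sample-redshift-role"  # role Redshift assumes to read/write that S3 path
redshift_engine = wr.catalog.get_engine("sample-redshift-connection")  # Glue Catalog connection pointing at the cluster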
# Load data into redshift using COPY command
wr.db.copy_to_redshift(
    df=df,
    path=path,
    con=redshift_engine,
    schema="public",
    table="sample_table",
    mode="overwrite",
    iam_role=iam_role
)
# Unload data from redshift using UNLOAD command
wr.db.unload_redshift(
    sql="SELECT * FROM public.sample_table",
    con=redshift_engine,
    iam_role=iam_role,
    path=path,
    keep_files=True
)
# Read data from Athena using SQL query
wr.athena.read_sql_query("SELECT * FROM sample_table", database="sample_db")
# Read data from Athena using SQL query using specific chunksize
df_chunks = wr.athena.read_sql_query(
    "SELECT * FROM sample_table",
    database="sample_db",
    chunksize=10000,            # or True to let wrangler decide for you
    categories=["col1", "col2"] # can improve performance a lot
)
for df in df_chunks:
    print(len(df.index))
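# By default wrangler runs Athena reads with the CTAS approach (results staged as
# Parquet, which is faster for large result sets); it can be turned off if you lack the
# Glue/S3 permissions it needs. A minimal sketch with placeholder table/database names:
df = wr.athena.read_sql_query(
    "SELECT * FROM sample_table",
    database="sample_db",
    ctas_approach=False  # plain Athena query, results parsed from the regular CSV output
)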
# merging the underlying s3 paths of two datasets into one
wr.s3.merge_datasets(
    source_path=path2,
    target_path=path1,
    mode="append"
)
# use wr.catalog.extract_athena_types() to get partitions_types and columns_types
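# A minimal sketch of that call (the partition column name is a placeholder):
columns_types, partitions_types = wr.catalog.extract_athena_types(
    df=df,
    file_format="csv",           # match the format of the table created below
    index=False,
    partition_cols=["col_name2"]
)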
wr.catalog.create_csv_table(
    table="csv_crawler",
    database="awswrangler_test",
    path=path,
    partitions_types=partitions_types,
    columns_types=columns_types
)