Useful snippets from the docs with some commentary in the comments
import awswrangler as wr
import pandas as pd

# Get the current IAM role/user
name = wr.sts.get_current_identity_name()
arn = wr.sts.get_current_identity_arn()
# Reading files
df = wr.s3.read_csv("s3://sample-bucket/sample.csv")  # optionally select a subset of columns with usecols=['col_name1'] and parse date columns with parse_dates=['col_name2']
df = wr.s3.read_json("s3://sample-bucket/sample.json")
df = wr.s3.read_parquet("s3://sample-bucket/sample.parquet")  # you can read individual parquet files or every parquet file under a prefix (see the dataset sketch below)
df = wr.athena.read_sql_query("SELECT * FROM table_name LIMIT 100", database="test_database")
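
# Rough sketch: read_parquet can also treat a whole prefix as a partitioned dataset
# (dataset=True and columns= are assumed here from the awswrangler docs; paths are placeholders)
df = wr.s3.read_parquet("s3://sample-bucket/dataset/", dataset=True, columns=["col_name1"])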
# Writing files
wr.s3.to_csv(df, "s3://sample-bucket/sample_2.csv", index=False)  # note there's a specific CSV<>Athena function below (wr.catalog.create_csv_table)
wr.s3.to_json(df, "s3://sample-bucket/sample_2.json")
wr.s3.to_parquet(df, "s3://sample-bucket/sample_2.parquet")
# Retrieve databases in the Glue catalog
wr.catalog.databases()

# Retrieve tables in a Glue database
wr.catalog.tables(database="sample_db")

# Search for tables in the Glue catalog
wr.catalog.tables(name_contains="sample")
wr.catalog.tables(name_prefix="sample_")
wr.catalog.tables(name_suffix="_table")
wr.catalog.tables(search_text="table description")

# Retrieve table details
wr.catalog.table(database="sample_db", table="sample_table")

# Delete a table
wr.catalog.delete_table_if_exists(database="sample_db", table="sample_table")

# Delete a database
wr.catalog.delete_database('sample_db')
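
# Rough sketch: check whether a table exists before acting on it
# (does_table_exist is assumed from the catalog docs)
wr.catalog.does_table_exist(database="sample_db", table="sample_table")  # returns True/False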
# Get a SQLAlchemy Engine from a Glue Catalog Connection
engine = wr.catalog.get_engine(name='sample_connection')

# Get a SQLAlchemy Engine from explicit connection parameters
postgres_engine = wr.db.get_engine(
    db_type="postgresql",
    host="127.0.0.1",
    port=5432,
    database="sample_db",
    user="sample_user",
    password="sample_password"
)
# Read data from a table in a database
wr.db.read_sql_query("SELECT * FROM public.sample_table", con=postgres_engine)

# Write data into a table in a database
wr.db.to_sql(df, postgres_engine, schema="public", name="sample_table", if_exists="replace", index=False)
# Load data into Redshift using the COPY command
wr.db.copy_to_redshift(
    df=df,
    path=path,
    con=redshift_engine,
    schema="public",
    table="sample_table",
    mode="overwrite",
    iam_role=iam_role
)

# Unload data from Redshift using the UNLOAD command
wr.db.unload_redshift(
    sql="SELECT * FROM public.sample_table",
    con=redshift_engine,
    iam_role=iam_role,
    path=path,
    keep_files=True
)
# Read data from Athena using a SQL query
wr.athena.read_sql_query("SELECT * FROM sample_table", database="sample_db")

# Read data from Athena in chunks of a given size
df_chunks = wr.athena.read_sql_query(
    "SELECT * FROM sample_table",
    database="sample_db",
    chunksize=10000,  # or True to let wrangler decide the chunk size for you
    categories=["col1", "col2"]  # declaring categorical columns can improve performance a lot
)
for df in df_chunks:
    print(len(df.index))
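
# Rough sketch: for small result sets, ctas_approach=False skips the CTAS temporary table
# (ctas_approach is assumed from the read_sql_query docs)
df_small = wr.athena.read_sql_query(
    "SELECT * FROM sample_table LIMIT 10",
    database="sample_db",
    ctas_approach=False
)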
# Merge the underlying S3 paths of two datasets into one
wr.s3.merge_datasets(
    source_path=path2,
    target_path=path1,
    mode="append"
)

# Use wr.catalog.extract_athena_types() to get partitions_types and columns_types
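# Sketch of the extract_athena_types() call mentioned above
# (file_format/index/partition_cols arguments assumed from the docs; column names are placeholders)
columns_types, partitions_types = wr.catalog.extract_athena_types(
    df=df,
    file_format="csv",
    index=False,
    partition_cols=["col_name2"]
)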
wr.catalog.create_csv_table(
    table="csv_crawler",
    database="awswrangler_test",
    path=path,
    partitions_types=partitions_types,
    columns_types=columns_types
)