Skip to content

Instantly share code, notes, and snippets.

@cobookman
Last active September 8, 2020 07:20
Show Gist options
  • Save cobookman/0dd1ef504da96502cb7e2ad1488f6866 to your computer and use it in GitHub Desktop.
Save cobookman/0dd1ef504da96502cb7e2ad1488f6866 to your computer and use it in GitHub Desktop.
Dump AWS Glue metadata
Lab Doc
https://docs.google.com/document/d/1Kw4dhhjFLvRXUMZzr9HsIDt8mQfccXopP9jbPdd5NX8/edit?usp=sharing
"""Dumps Databases, Tables, and Partitions from AWS Glue."""
import logging
import sys
import boto3
import json
# Send log records to STDERR; only messages at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR, stream=sys.stderr)
def main():
    """Dump every Glue database, its tables, and their partitions as JSON.

    Builds a Glue client, walks databases -> tables -> partitions, nests the
    results (``database['Tables']``, ``table['Partitions']``) and prints one
    JSON document of the form ``{'Databases': [...]}`` to STDOUT.
    """
    # AWS credentials of a user with Glue GetDatabase/GetDatabases/
    # GetPartition/GetPartitions/GetTable/GetTables IAM permissions.
    # NOTE(review): placeholders only -- do not commit real keys to source.
    ACCESS_KEY = '****'
    SECRET_KEY = '****'
    REGION = 'us-west-2'

    # CatalogId (string) is the AWS Account ID where the Glue catalog resides.
    # If CatalogId is not provided, the authenticated user's AWS Account ID
    # is used by default.
    CATALOG_ID = '***'

    # Use REGION here (the original hard-coded the region a second time, so
    # editing REGION silently had no effect).
    glue_client = boto3.client(
        service_name='glue',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY)

    databases = get_glue_databases(glue_client, CATALOG_ID)
    for database in databases:
        tables = get_glue_tables(glue_client, CATALOG_ID, database['Name'])
        for table in tables:
            table['Partitions'] = get_glue_partitions(
                glue_client, CATALOG_ID, database['Name'], table['Name'])
        database['Tables'] = tables

    # default=str makes json fall back to str() for any value it cannot
    # serialize natively instead of raising TypeError.
    print(json.dumps({'Databases': databases}, indent=2, sort_keys=True, default=str))
def get_glue_partitions(glue_client, catalog_id, database_name, table_name):
    """Collect all partitions of one table, following NextToken pagination.

    Calls ``glue_client.get_partitions`` repeatedly, starting with an empty
    ``NextToken``, and concatenates the ``Partitions`` entry of every page
    until a response arrives without a ``NextToken`` key.
    """
    collected = []
    token = ''
    more_pages = True
    while more_pages:
        page = glue_client.get_partitions(
            CatalogId=catalog_id,
            DatabaseName=database_name,
            TableName=table_name,
            NextToken=token)
        collected.extend(page['Partitions'])
        # Keep going only while the service hands back a continuation token.
        more_pages = 'NextToken' in page
        if more_pages:
            token = page['NextToken']
    return collected
def get_glue_tables(glue_client, catalog_id, database_name):
    """Collect all tables of one database, following NextToken pagination.

    Issues ``glue_client.get_tables`` with an empty ``NextToken`` first, then
    re-issues it with each returned token, accumulating every page's
    ``TableList`` until a response carries no ``NextToken`` key.
    """
    found = []
    request = {
        'CatalogId': catalog_id,
        'DatabaseName': database_name,
        'NextToken': '',
    }
    while True:
        page = glue_client.get_tables(**request)
        found.extend(page['TableList'])
        if 'NextToken' in page:
            # More results remain: carry the continuation token forward.
            request['NextToken'] = page['NextToken']
            continue
        return found
def get_glue_databases(glue_client, catalog_id):
    """Return every Glue database in the catalog, following pagination.

    Args:
        glue_client: Object exposing the Glue ``get_databases`` call
            (e.g. a boto3 Glue client).
        catalog_id: Value sent as ``CatalogId`` on every request.

    Returns:
        A list concatenating the ``DatabaseList`` of every result page.
    """
    databases = []
    next_token = ''  # An empty token requests the first page.
    while True:  # Emulating do-while: always fetch at least one page.
        # Pass the tracked token; the original always sent "" and therefore
        # could never advance past the first page.
        resp = glue_client.get_databases(
            CatalogId=catalog_id,
            NextToken=next_token)
        databases += resp['DatabaseList']
        # The continuation token lives on the response, not on the
        # accumulated list (indexing the list with a str raised TypeError).
        next_token = resp.get('NextToken')
        if not next_token:
            break
    return databases
# Run the dump only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
@cobookman
Copy link
Author

**Note:** if there is a substantial number of tables or databases, you might want to use a worker thread pool to shard the GetTables / GetPartitions requests across many threads.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment