Skip to content

Instantly share code, notes, and snippets.

@t04glovern
Last active February 5, 2024 04:46
Show Gist options
  • Save t04glovern/8f7766b9b38e3d8b5bde6ca896fc3af1 to your computer and use it in GitHub Desktop.
Save t04glovern/8f7766b9b38e3d8b5bde6ca896fc3af1 to your computer and use it in GitHub Desktop.
Example of using the PyIceberg API with an existing Iceberg table (created with https://gist.github.com/t04glovern/04f6f2934353eb1d0fffd487e9b9b6a3).
#!/usr/bin/env python3
#
# -- Run other script to create the Iceberg table
#
# pip install boto3
# curl https://gist.githubusercontent.com/t04glovern/04f6f2934353eb1d0fffd487e9b9b6a3/raw \
# > lets-try-iceberg.py \
# && chmod +x lets-try-iceberg.py
# ./lets-try-iceberg.py --table lets_try_iceberg
#
# -- Run this script to connect to the Iceberg table using PyIceberg
#
# pip install pyarrow pyiceberg
# ./lets-try-pyiceberg.py --bucket <bucket-name>
import argparse
import logging
import os
from pyiceberg.catalog import load_catalog
# Region shared by the catalog configuration and the process environment.
aws_region: str = "us-west-2"

# pyarrow needs the region in the environment to work with S3.
os.environ['AWS_DEFAULT_REGION'] = aws_region

# Emit INFO-level records so the script's output is visible.
logging.basicConfig(level=logging.INFO)
def main(bucket_name: str) -> None:
    """Connect to the Glue-backed catalog and log a sample of the table.

    Logs the catalog's namespaces, the tables in the ``default`` namespace,
    and the result of scanning ``default.lets_try_iceberg`` with a limit of
    10 rows.

    Args:
        bucket_name: Name of the S3 bucket where the existing Iceberg table
            is stored; passed to the catalog as the ``s3://<bucket_name>``
            ``uri`` property.
    """
    # Namespace used for both the table listing and the table lookup.
    namespace = 'default'

    # Configure the catalog
    catalog = load_catalog(
        'default',
        **{
            'type': 'glue',
            'uri': f's3://{bucket_name}',
            'region_name': aws_region,
        },
    )

    # List the namespaces
    namespaces = catalog.list_namespaces()
    logging.info('Namespaces: %s', namespaces)

    # List the tables (assumes the default namespace)
    tables = catalog.list_tables(namespace)
    logging.info('Tables: %s', tables)

    # Load a table
    table = catalog.load_table((namespace, 'lets_try_iceberg'))

    # Scan the table: project a fixed set of columns, cap the row count,
    # and convert the result with to_arrow().
    scan = table.scan(
        selected_fields=('id', 'timestamp', 'speed', 'temperature', 'location'),
        limit=10,
    ).to_arrow()

    # Print the results
    logging.info('Scan: %s', scan)
if __name__ == '__main__':
    # Command-line entry point: --bucket is the only option and is mandatory.
    cli = argparse.ArgumentParser(description="PyIceberg sample script")
    cli.add_argument(
        "--bucket",
        required=True,
        type=str,
        help="The S3 bucket name where the existing Iceberg table is stored.",
    )
    # Hand the parsed bucket name straight to main().
    main(cli.parse_args().bucket)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment