Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save lamb-russell/4f42891c55425ec29fb23593eecbd5cd to your computer and use it in GitHub Desktop.
Save lamb-russell/4f42891c55425ec29fb23593eecbd5cd to your computer and use it in GitHub Desktop.
DuckDb and PyIceberg example
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"1aba39ae35bc4fb2b88adc346e18568a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a8f093b1c6814577a24d56a2bee85a2d",
"max": 100,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_f1670a9277de47b2a371d87bfdd2d2ca",
"value": 100
}
},
"a8f093b1c6814577a24d56a2bee85a2d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "auto"
}
},
"f1670a9277de47b2a371d87bfdd2d2ca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": "black",
"description_width": ""
}
},
"a3feb983d94d4b6b9a6900e994387d90": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_19c3c4eae073475188884cc8421ac4e4",
"max": 100,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_832a49bbebdc49ceaba1de536c218566",
"value": 100
}
},
"19c3c4eae073475188884cc8421ac4e4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "auto"
}
},
"832a49bbebdc49ceaba1de536c218566": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": "black",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "NTf__OFI2lC2"
},
"outputs": [],
"source": [
"!pip install --quiet duckdb"
]
},
{
"cell_type": "code",
"source": [
"!pip install --quiet pyiceberg"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "W9e4XgUBwUuo",
"outputId": "29d6f42f-3ed6-4eda-aa1a-9a2b9eee0668"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m969.9/969.9 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.9/123.9 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install sqlalchemy"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "TlRyIpJdxOt1",
"outputId": "23ee8a73-9ec1-41ee-d078-9b88e8bbabae"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (2.0.31)\n",
"Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (4.12.2)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (3.0.3)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import logging\n",
"logging.basicConfig(level=logging.DEBUG)"
],
"metadata": {
"id": "25ED4iUnMbqf"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import duckdb\n",
"\n",
"duckdb.sql(\"SELECT 1\").df\n"
],
"metadata": {
"id": "jgn93mtb3AS1",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "6f3a48be-7690-44c5-d7d2-cebf16fa0291"
},
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<bound method PyCapsule.df of ┌───────┐\n",
"│ 1 │\n",
"│ int32 │\n",
"├───────┤\n",
"│ 1 │\n",
"└───────┘\n",
">"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "GNDv5_GnEqVo"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Create an Iceberg Catalog\n",
"Use sqlalchemy to create a sqlite catalog for iceberg. Create a table in iceberg and load the taxi dataset."
],
"metadata": {
"id": "nCGwS0I8K6Ko"
}
},
{
"cell_type": "code",
"source": [
"!mkdir \"/tmp/warehouse\" # make directory for catalog SqlLite file"
],
"metadata": {
"id": "vydzcwjiFOAY"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from pyiceberg.catalog.sql import SqlCatalog\n",
"\n",
"warehouse_path = \"/tmp/warehouse\"\n",
"catalog = SqlCatalog(\n",
" \"default\",\n",
" **{\n",
" \"uri\": f\"sqlite:///{warehouse_path}/pyiceberg_catalog.db\",\n",
" \"warehouse\": f\"file://{warehouse_path}\",\n",
" },\n",
")"
],
"metadata": {
"id": "HHJdYFAgFTof"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Load Data into a Table\n",
"Load taxi data into a iceberg table"
],
"metadata": {
"id": "r0XFuTkx6rcV"
}
},
{
"cell_type": "code",
"source": [
"arrow_df = duckdb.sql(\"select * from read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')\").arrow()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"1aba39ae35bc4fb2b88adc346e18568a",
"a8f093b1c6814577a24d56a2bee85a2d",
"f1670a9277de47b2a371d87bfdd2d2ca"
]
},
"id": "aLwhWCs0GAuH",
"outputId": "bb575d4f-dfc8-4171-b71e-f8cabfe28d6a"
},
"execution_count": 8,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "1aba39ae35bc4fb2b88adc346e18568a"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"schema = arrow_df.schema\n",
"schema"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "97RLi8PIGIMB",
"outputId": "296a6798-d337-43f3-9bc3-e0a9eece40ae"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"VendorID: int64\n",
"tpep_pickup_datetime: timestamp[us]\n",
"tpep_dropoff_datetime: timestamp[us]\n",
"passenger_count: double\n",
"trip_distance: double\n",
"RatecodeID: double\n",
"store_and_fwd_flag: string\n",
"PULocationID: int64\n",
"DOLocationID: int64\n",
"payment_type: int64\n",
"fare_amount: double\n",
"extra: double\n",
"mta_tax: double\n",
"tip_amount: double\n",
"tolls_amount: double\n",
"improvement_surcharge: double\n",
"total_amount: double\n",
"congestion_surcharge: double\n",
"airport_fee: double"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [
"# create the iceberg table\n",
"\n",
"from pyiceberg.exceptions import NoSuchNamespaceError, NamespaceAlreadyExistsError, TableAlreadyExistsError\n",
"\n",
"# Define the table name and namespace\n",
"namespace = \"taxi_namespace\"\n",
"table_name = \"taxi_trips\"\n",
"\n",
"# Create the namespace if it doesn't exist\n",
"try:\n",
" catalog.create_namespace(namespace)\n",
"except NamespaceAlreadyExistsError:\n",
" print(\"Namespace already exists.\")\n",
" pass # Namespace already exists\n",
"\n",
"# Create or load the Iceberg table\n",
"try:\n",
" table = catalog.create_table(f\"{namespace}.{table_name}\", schema)\n",
"except TableAlreadyExistsError:\n",
" table = catalog.load_table(f\"{namespace}.{table_name}\")\n",
" print(\"Iceberg table already exists.\")"
],
"metadata": {
"id": "TW-4s9WlFgOM"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"table.overwrite(arrow_df) # the data frame to the iceberg dataset"
],
"metadata": {
"id": "_zra0MQUv2d5"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"len(table.scan().to_arrow())\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "grT3hGJVKIOf",
"outputId": "08a05884-6c1a-4337-e0c3-d307e936175c"
},
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3066766"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"df=table.scan().to_pandas()\n"
],
"metadata": {
"id": "zb--g4VtKQJi"
},
"execution_count": 13,
"outputs": []
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "1mb4GLeuLdtn"
}
},
{
"cell_type": "code",
"source": [
"r=duckdb.register(\"duck_test\",df)\n"
],
"metadata": {
"id": "w0qsEHTILWJn"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"duckdb.sql(\"select * from duck_test\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "A_d_VPJJKdds",
"outputId": "5c9b2533-ff3a-4b9f-b828-1841d4992c38"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"┌──────────┬──────────────────────┬──────────────────────┬───┬──────────────┬──────────────────────┬─────────────┐\n",
"│ VendorID │ tpep_pickup_datetime │ tpep_dropoff_datet… │ … │ total_amount │ congestion_surcharge │ airport_fee │\n",
"│ int64 │ timestamp │ timestamp │ │ double │ double │ double │\n",
"├──────────┼──────────────────────┼──────────────────────┼───┼──────────────┼──────────────────────┼─────────────┤\n",
"│ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ … │ 14.3 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ … │ 16.9 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ … │ 34.9 │ 2.5 │ 0.0 │\n",
"│ 1 │ 2023-01-01 00:03:48 │ 2023-01-01 00:13:25 │ … │ 20.85 │ 0.0 │ 1.25 │\n",
"│ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ … │ 19.68 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ … │ 27.8 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ … │ 20.52 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ … │ 64.44 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ … │ 28.38 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ … │ 19.9 │ 2.5 │ 0.0 │\n",
"│ · │ · │ · │ · │ · │ · │ · │\n",
"│ · │ · │ · │ · │ · │ · │ · │\n",
"│ · │ · │ · │ · │ · │ · │ · │\n",
"│ 2 │ 2023-01-01 01:28:24 │ 2023-01-01 01:56:38 │ … │ 42.3 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:25:30 │ 2023-01-01 01:46:33 │ … │ 35.6 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:49:47 │ 2023-01-01 02:01:27 │ … │ 26.63 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:13:11 │ 2023-01-01 01:16:31 │ … │ 14.04 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:22:22 │ 2023-01-01 01:33:19 │ … │ 21.36 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:37:42 │ 2023-01-01 01:49:29 │ … │ 17.8 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:51:17 │ 2023-01-01 02:02:07 │ … │ 22.9 │ 2.5 │ 0.0 │\n",
"│ 2 │ 2023-01-01 01:49:24 │ 2023-01-01 02:17:26 │ … │ 33.2 │ 2.5 │ 0.0 │\n",
"│ 1 │ 2023-01-01 01:16:50 │ 2023-01-01 01:23:29 │ … │ 10.4 │ 0.0 │ 0.0 │\n",
"│ 1 │ 2023-01-01 01:26:48 │ 2023-01-01 01:35:56 │ … │ 13.9 │ 0.0 │ 0.0 │\n",
"├──────────┴──────────────────────┴──────────────────────┴───┴──────────────┴──────────────────────┴─────────────┤\n",
"│ ? rows (>9999 rows, 20 shown) 19 columns (6 shown) │\n",
"└────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"duckdb.sql(\"SELECT * FROM information_schema.tables ;\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "20EPIneOLE04",
"outputId": "e3dca695-2110-48a2-8825-7de6e0083157"
},
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"┌───────────────┬──────────────┬────────────┬───┬────────────────────┬──────────┬───────────────┬───────────────┐\n",
"│ table_catalog │ table_schema │ table_name │ … │ is_insertable_into │ is_typed │ commit_action │ TABLE_COMMENT │\n",
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │ varchar │\n",
"├───────────────┼──────────────┼────────────┼───┼────────────────────┼──────────┼───────────────┼───────────────┤\n",
"│ temp │ main │ duck_test │ … │ NO │ NO │ NULL │ NULL │\n",
"├───────────────┴──────────────┴────────────┴───┴────────────────────┴──────────┴───────────────┴───────────────┤\n",
"│ 1 rows 13 columns (7 shown) │\n",
"└───────────────────────────────────────────────────────────────────────────────────────────────────────────────┘"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "markdown",
"source": [
"# Use PyIceberg CLI to read\n",
"Use the pyiceberg cli to read describe the newly created table"
],
"metadata": {
"id": "mSlEj5GSP22M"
}
},
{
"cell_type": "code",
"source": [
"!pyiceberg --help"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "toT0G8eQM6UX",
"outputId": "eb7c9566-a020-4789-ca39-0ad850a106a1"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Usage: pyiceberg [OPTIONS] COMMAND [ARGS]...\n",
"\n",
"Options:\n",
" --catalog TEXT\n",
" --verbose BOOLEAN\n",
" --output [text|json]\n",
" --uri TEXT\n",
" --credential TEXT\n",
" --help Show this message and exit.\n",
"\n",
"Commands:\n",
" create Operation to create a namespace.\n",
" describe Describe a namespace or a table.\n",
" drop Operations to drop a namespace or table.\n",
" files List all the files of the table.\n",
" list List tables or namespaces.\n",
" list-refs List all the refs in the provided table.\n",
" location Return the location of the table.\n",
" properties Properties on tables/namespaces.\n",
" rename Rename a table.\n",
" schema Get the schema of the table.\n",
" spec Return the partition spec of the table.\n",
" uuid Return the UUID of the table.\n",
" version Print pyiceberg version.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pyiceberg --catalog default --uri sqlite:////tmp/warehouse/pyiceberg_catalog.db list"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xCVzpKzZM_Vp",
"outputId": "a29cf4e9-6c5d-485b-e620-c0ed579c8646"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"taxi_namespace\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pyiceberg --catalog default --uri sqlite:////tmp/warehouse/pyiceberg_catalog.db describe taxi_namespace.taxi_trips"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xx4Vb1hINy9M",
"outputId": "faa5e585-cceb-4999-d843-9aa14e623d45"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Table format version 2 \n",
"Metadata location file:///tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/00001-86911ea9-b1…\n",
"Table UUID 14d1f1e3-c7d5-4099-ba01-db72e6d43773 \n",
"Last Updated 1721786211755 \n",
"Partition spec [] \n",
"Sort order [] \n",
"Current schema Schema, id=0 \n",
" ├── 1: VendorID: optional long \n",
" ├── 2: tpep_pickup_datetime: optional timestamp \n",
" ├── 3: tpep_dropoff_datetime: optional timestamp \n",
" ├── 4: passenger_count: optional double \n",
" ├── 5: trip_distance: optional double \n",
" ├── 6: RatecodeID: optional double \n",
" ├── 7: store_and_fwd_flag: optional string \n",
" ├── 8: PULocationID: optional long \n",
" ├── 9: DOLocationID: optional long \n",
" ├── 10: payment_type: optional long \n",
" ├── 11: fare_amount: optional double \n",
" ├── 12: extra: optional double \n",
" ├── 13: mta_tax: optional double \n",
" ├── 14: tip_amount: optional double \n",
" ├── 15: tolls_amount: optional double \n",
" ├── 16: improvement_surcharge: optional double \n",
" ├── 17: total_amount: optional double \n",
" ├── 18: congestion_surcharge: optional double \n",
" └── 19: airport_fee: optional double \n",
"Current snapshot Operation.APPEND: id=5984614148918088538, schema_id=0 \n",
"Snapshots Snapshots \n",
" └── Snapshot 5984614148918088538, schema 0: \n",
" file:///tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/snap-59846141…\n",
"Properties \n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "5LjvcWdWQQ5B"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Read from the catalog using PyIceberg"
],
"metadata": {
"id": "RQRyWG3Ddnjm"
}
},
{
"cell_type": "code",
"source": [
"from pyiceberg.catalog import load_catalog\n",
"\n",
"read_catalog=load_catalog(\n",
" \"default\",\n",
" **{\n",
" \"uri\": \"sqlite:////tmp/warehouse/pyiceberg_catalog.db\",\n",
" \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n",
"\n",
" }\n",
")\n",
"\n",
"read_catalog.list_namespaces() # list all namespaces"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hnz1aN_PQDZk",
"outputId": "62a1cb9b-a70b-4816-c3a9-611508d8ce87"
},
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('taxi_namespace',)]"
]
},
"metadata": {},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"source": [
"read_catalog.list_tables(\"taxi_namespace\") # list all tables in the namespace"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Mmwx7mvndr22",
"outputId": "60a00c90-82ce-46d4-806b-8d62258390ea"
},
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('taxi_namespace', 'taxi_trips')]"
]
},
"metadata": {},
"execution_count": 25
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"read_table = read_catalog.load_table(\"taxi_namespace.taxi_trips\")\n",
"read_table"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yhg4yI6kd5fD",
"outputId": "4d52f8ff-c7ff-4283-b179-b704bdc3852b"
},
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"taxi_trips(\n",
" 1: VendorID: optional long,\n",
" 2: tpep_pickup_datetime: optional timestamp,\n",
" 3: tpep_dropoff_datetime: optional timestamp,\n",
" 4: passenger_count: optional double,\n",
" 5: trip_distance: optional double,\n",
" 6: RatecodeID: optional double,\n",
" 7: store_and_fwd_flag: optional string,\n",
" 8: PULocationID: optional long,\n",
" 9: DOLocationID: optional long,\n",
" 10: payment_type: optional long,\n",
" 11: fare_amount: optional double,\n",
" 12: extra: optional double,\n",
" 13: mta_tax: optional double,\n",
" 14: tip_amount: optional double,\n",
" 15: tolls_amount: optional double,\n",
" 16: improvement_surcharge: optional double,\n",
" 17: total_amount: optional double,\n",
" 18: congestion_surcharge: optional double,\n",
" 19: airport_fee: optional double\n",
"),\n",
"partition by: [],\n",
"sort order: [],\n",
"snapshot: Operation.APPEND: id=5984614148918088538, schema_id=0"
]
},
"metadata": {},
"execution_count": 30
}
]
},
{
"cell_type": "markdown",
"source": [
"# Scan able to Pandas\n",
"Read data from the parquet files to a pandas data frame."
],
"metadata": {
"id": "TypJdWbxe5kg"
}
},
{
"cell_type": "code",
"source": [
"from pyiceberg.expressions import GreaterThanOrEqual\n",
"scan = table.scan(\n",
" row_filter=GreaterThanOrEqual(\"trip_distance\", 10.0),\n",
" selected_fields=(\"VendorID\", \"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"),\n",
" limit=100,\n",
")\n",
"\n",
"[task.file.file_path for task in scan.plan_files()] # show the file that contains the rows\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fd5h-Gl7d-ti",
"outputId": "244a0df3-0d4f-4315-b19c-bd8eb8c337e8"
},
"execution_count": 42,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<pyiceberg.table.DataScan object at 0x7b94e2cb2200>\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['file:///tmp/warehouse/taxi_namespace.db/taxi_trips/data/00000-0-4d92790a-a45b-4e41-807e-0299bbe7d87d.parquet']"
]
},
"metadata": {},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"scan_df = scan.to_pandas() # read the data to a pandas data frame\n",
"scan_df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "9Q30vYlNfTvS",
"outputId": "9ff898f1-5728-454d-9bc1-1c47904462b0"
},
"execution_count": 43,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" VendorID tpep_pickup_datetime tpep_dropoff_datetime\n",
"0 2 2023-01-01 00:27:12 2023-01-01 00:49:56\n",
"1 2 2023-01-01 00:09:29 2023-01-01 00:29:23\n",
"2 1 2023-01-01 00:13:30 2023-01-01 00:44:00\n",
"3 2 2023-01-01 00:41:41 2023-01-01 01:19:32\n",
"4 2 2023-01-01 00:22:39 2023-01-01 01:30:45\n",
".. ... ... ...\n",
"95 2 2023-01-01 00:56:24 2023-01-01 01:26:29\n",
"96 2 2023-01-01 00:55:38 2023-01-01 01:25:34\n",
"97 2 2023-01-01 00:13:36 2023-01-01 00:48:23\n",
"98 2 2023-01-01 00:51:18 2023-01-01 01:11:18\n",
"99 2 2023-01-01 00:27:34 2023-01-01 01:05:05\n",
"\n",
"[100 rows x 3 columns]"
],
"text/html": [
"\n",
" <div id=\"df-610138c9-cd6d-4c79-bb67-3ea11f3633df\" class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>VendorID</th>\n",
" <th>tpep_pickup_datetime</th>\n",
" <th>tpep_dropoff_datetime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:27:12</td>\n",
" <td>2023-01-01 00:49:56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:09:29</td>\n",
" <td>2023-01-01 00:29:23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2023-01-01 00:13:30</td>\n",
" <td>2023-01-01 00:44:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:41:41</td>\n",
" <td>2023-01-01 01:19:32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:22:39</td>\n",
" <td>2023-01-01 01:30:45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:56:24</td>\n",
" <td>2023-01-01 01:26:29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:55:38</td>\n",
" <td>2023-01-01 01:25:34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:13:36</td>\n",
" <td>2023-01-01 00:48:23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:51:18</td>\n",
" <td>2023-01-01 01:11:18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>2</td>\n",
" <td>2023-01-01 00:27:34</td>\n",
" <td>2023-01-01 01:05:05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 3 columns</p>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-610138c9-cd6d-4c79-bb67-3ea11f3633df')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-610138c9-cd6d-4c79-bb67-3ea11f3633df button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-610138c9-cd6d-4c79-bb67-3ea11f3633df');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-74f1ad6f-29a7-4225-b688-9cda9e5418a7\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-74f1ad6f-29a7-4225-b688-9cda9e5418a7')\"\n",
" title=\"Suggest charts\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:', error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-74f1ad6f-29a7-4225-b688-9cda9e5418a7 button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
"\n",
" <div id=\"id_eb84e669-b39a-46a4-9e56-7196f5fccf21\">\n",
" <style>\n",
" .colab-df-generate {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-generate:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-generate {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-generate:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
" <button class=\"colab-df-generate\" onclick=\"generateWithVariable('scan_df')\"\n",
" title=\"Generate code using this dataframe.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
" </svg>\n",
" </button>\n",
" <script>\n",
" (() => {\n",
" const buttonEl =\n",
" document.querySelector('#id_eb84e669-b39a-46a4-9e56-7196f5fccf21 button.colab-df-generate');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" buttonEl.onclick = () => {\n",
" google.colab.notebook.generateWithVariable('scan_df');\n",
" }\n",
" })();\n",
" </script>\n",
" </div>\n",
"\n",
" </div>\n",
" </div>\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "scan_df",
"summary": "{\n \"name\": \"scan_df\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"VendorID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tpep_pickup_datetime\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2023-01-01 00:01:55\",\n \"max\": \"2023-01-01 23:19:19\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"2023-01-01 00:50:48\",\n \"2023-01-01 00:13:55\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tpep_dropoff_datetime\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2023-01-01 00:27:47\",\n \"max\": \"2023-01-02 00:11:52\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"2023-01-01 00:40:09\",\n \"2023-01-01 00:35:19\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 43
}
]
},
{
"cell_type": "markdown",
"source": [
"# Scan into DuckDB\n",
"query the scanned rows using duckdb"
],
"metadata": {
"id": "SyumqklHfa9b"
}
},
{
"cell_type": "code",
"source": [
"scan_duck = scan.to_duckdb(table_name=\"distant_taxi_trips\") # scan into duckdb\n",
"scan_duck.sql(\"select * from distant_taxi_trips\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "k79Ethzjfg5A",
"outputId": "e6ed444f-9265-4bec-b31e-2977cdbe405c"
},
"execution_count": 49,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"┌──────────┬──────────────────────┬───────────────────────┐\n",
"│ VendorID │ tpep_pickup_datetime │ tpep_dropoff_datetime │\n",
"│ int64 │ timestamp │ timestamp │\n",
"├──────────┼──────────────────────┼───────────────────────┤\n",
"│ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │\n",
"│ 2 │ 2023-01-01 00:09:29 │ 2023-01-01 00:29:23 │\n",
"│ 1 │ 2023-01-01 00:13:30 │ 2023-01-01 00:44:00 │\n",
"│ 2 │ 2023-01-01 00:41:41 │ 2023-01-01 01:19:32 │\n",
"│ 2 │ 2023-01-01 00:22:39 │ 2023-01-01 01:30:45 │\n",
"│ 1 │ 2023-01-01 00:18:29 │ 2023-01-01 00:55:20 │\n",
"│ 2 │ 2023-01-01 00:25:14 │ 2023-01-01 01:00:29 │\n",
"│ 2 │ 2023-01-01 00:37:44 │ 2023-01-01 01:16:14 │\n",
"│ 2 │ 2023-01-01 00:49:24 │ 2023-01-01 01:30:59 │\n",
"│ 1 │ 2023-01-01 00:21:15 │ 2023-01-01 00:52:25 │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ 2 │ 2023-01-01 00:12:07 │ 2023-01-01 00:44:26 │\n",
"│ 2 │ 2023-01-01 00:47:21 │ 2023-01-01 01:30:32 │\n",
"│ 2 │ 2023-01-01 00:47:39 │ 2023-01-01 01:23:27 │\n",
"│ 1 │ 2023-01-01 00:56:13 │ 2023-01-01 01:20:11 │\n",
"│ 2 │ 2023-01-01 00:05:24 │ 2023-01-01 00:33:50 │\n",
"│ 2 │ 2023-01-01 00:56:24 │ 2023-01-01 01:26:29 │\n",
"│ 2 │ 2023-01-01 00:55:38 │ 2023-01-01 01:25:34 │\n",
"│ 2 │ 2023-01-01 00:13:36 │ 2023-01-01 00:48:23 │\n",
"│ 2 │ 2023-01-01 00:51:18 │ 2023-01-01 01:11:18 │\n",
"│ 2 │ 2023-01-01 00:27:34 │ 2023-01-01 01:05:05 │\n",
"├──────────┴──────────────────────┴───────────────────────┤\n",
"│ 100 rows (20 shown) 3 columns │\n",
"└─────────────────────────────────────────────────────────┘"
]
},
"metadata": {},
"execution_count": 49
}
]
},
{
"cell_type": "markdown",
"source": [
"# Query Iceberg Table\n",
"Use duckdb and the Iceberg extension to query iceberg"
],
"metadata": {
"id": "GpBp6vtB61v5"
}
},
{
"cell_type": "code",
"source": [
"duckdb.sql(\"INSTALL iceberg;\")"
],
"metadata": {
"id": "WHTn0dl3F_6W",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"a3feb983d94d4b6b9a6900e994387d90",
"19c3c4eae073475188884cc8421ac4e4",
"832a49bbebdc49ceaba1de536c218566"
]
},
"outputId": "7dd1bac4-d683-47bb-b37b-ebf40875b758"
},
"execution_count": 50,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "a3feb983d94d4b6b9a6900e994387d90"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"duckdb.sql(\"LOAD ICEBERG;\")"
],
"metadata": {
"id": "FS7JJjQfFxJw"
},
"execution_count": 51,
"outputs": []
},
{
"cell_type": "code",
"source": [
"query=f\"\"\"\n",
" SELECT *\n",
" FROM iceberg_scan('{warehouse_path}/{namespace}.db/{table_name}')\n",
"\"\"\"\n",
"query"
],
"metadata": {
"id": "0lqme_aYQMXm",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "3b67ea09-6449-4981-8358-2e11030c8077"
},
"execution_count": 52,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"\\n SELECT *\\n FROM iceberg_scan('/tmp/warehouse/taxi_namespace.db/taxi_trips')\\n\""
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 52
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"# Query the Iceberg table\n",
"result = duckdb.sql(query)"
],
"metadata": {
"id": "x49btKTdK8hB",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 287
},
"outputId": "670cd7cc-7150-4bc8-ba54-de16ead5e45f"
},
"execution_count": 53,
"outputs": [
{
"output_type": "error",
"ename": "IOException",
"evalue": "IO Error: Cannot open file \"/tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/version-hint.text\": No such file or directory",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIOException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-53-4ef425e70b2c>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Query the Iceberg table\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mduckdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/duckdb/__init__.py\u001b[0m in \u001b[0;36msql\u001b[0;34m(query, **kwargs)\u001b[0m\n\u001b[1;32m 455\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mduckdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\":default:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0m_exported_symbols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sql'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIOException\u001b[0m: IO Error: Cannot open file \"/tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/version-hint.text\": No such file or directory"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "9mm28__GGafk"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Detach the existing \"sqlite_db\" if it exists\n",
"duckdb.execute(\"DETACH DATABASE IF EXISTS sqlite_db;\")\n",
"\n",
"# Attach the SQLite database to DuckDB\n",
"duckdb.execute(\"INSTALL sqlite_scanner;\")\n",
"duckdb.execute(\"LOAD sqlite_scanner;\")\n",
"duckdb.execute(\"ATTACH DATABASE '/tmp/warehouse/pyiceberg_catalog.db' AS sqlite_db;\")\n",
"\n"
],
"metadata": {
"id": "C3JTQUQIGlvm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"duckdb.sql(\"select * from main.sqlite_master\")"
],
"metadata": {
"id": "Oir46FBqQl_O"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "yzUrhj0v0pV6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls /tmp/"
],
"metadata": {
"id": "dkfCydJOG1Jz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls /tmp/warehouse/"
],
"metadata": {
"id": "HmP2N2vbG5JQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat /tmp/warehouse/pyiceberg_catalog.db"
],
"metadata": {
"id": "16B7kStLG9P6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls /tmp/warehouse/taxi_namespace.db"
],
"metadata": {
"id": "nD8wz38TII7E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls /tmp/warehouse/taxi_namespace.db/taxi_trips"
],
"metadata": {
"id": "OwXHQg9MILdW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!ls /tmp/warehouse/taxi_namespace.db/taxi_trips/metadata"
],
"metadata": {
"id": "_B9FlmDeIP3o"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "j7Jj9-4na9Ld"
}
},
{
"cell_type": "code",
"source": [
"!pip freeze"
],
"metadata": {
"id": "pl9ylWoRJSue"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "oWO4asQXJTDG"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment