Created
July 24, 2024 02:17
-
-
Save lamb-russell/4f42891c55425ec29fb23593eecbd5cd to your computer and use it in GitHub Desktop.
DuckDb and PyIceberg example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"1aba39ae35bc4fb2b88adc346e18568a": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "FloatProgressModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "ProgressView", | |
"bar_style": "", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_a8f093b1c6814577a24d56a2bee85a2d", | |
"max": 100, | |
"min": 0, | |
"orientation": "horizontal", | |
"style": "IPY_MODEL_f1670a9277de47b2a371d87bfdd2d2ca", | |
"value": 100 | |
} | |
}, | |
"a8f093b1c6814577a24d56a2bee85a2d": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": "auto" | |
} | |
}, | |
"f1670a9277de47b2a371d87bfdd2d2ca": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "ProgressStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"bar_color": "black", | |
"description_width": "" | |
} | |
}, | |
"a3feb983d94d4b6b9a6900e994387d90": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_dom_classes": [], | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "FloatProgressModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/controls", | |
"_view_module_version": "1.5.0", | |
"_view_name": "ProgressView", | |
"bar_style": "", | |
"description": "", | |
"description_tooltip": null, | |
"layout": "IPY_MODEL_19c3c4eae073475188884cc8421ac4e4", | |
"max": 100, | |
"min": 0, | |
"orientation": "horizontal", | |
"style": "IPY_MODEL_832a49bbebdc49ceaba1de536c218566", | |
"value": 100 | |
} | |
}, | |
"19c3c4eae073475188884cc8421ac4e4": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.2.0", | |
"_model_name": "LayoutModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "LayoutView", | |
"align_content": null, | |
"align_items": null, | |
"align_self": null, | |
"border": null, | |
"bottom": null, | |
"display": null, | |
"flex": null, | |
"flex_flow": null, | |
"grid_area": null, | |
"grid_auto_columns": null, | |
"grid_auto_flow": null, | |
"grid_auto_rows": null, | |
"grid_column": null, | |
"grid_gap": null, | |
"grid_row": null, | |
"grid_template_areas": null, | |
"grid_template_columns": null, | |
"grid_template_rows": null, | |
"height": null, | |
"justify_content": null, | |
"justify_items": null, | |
"left": null, | |
"margin": null, | |
"max_height": null, | |
"max_width": null, | |
"min_height": null, | |
"min_width": null, | |
"object_fit": null, | |
"object_position": null, | |
"order": null, | |
"overflow": null, | |
"overflow_x": null, | |
"overflow_y": null, | |
"padding": null, | |
"right": null, | |
"top": null, | |
"visibility": null, | |
"width": "auto" | |
} | |
}, | |
"832a49bbebdc49ceaba1de536c218566": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_model_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_model_name": "ProgressStyleModel", | |
"_view_count": null, | |
"_view_module": "@jupyter-widgets/base", | |
"_view_module_version": "1.2.0", | |
"_view_name": "StyleView", | |
"bar_color": "black", | |
"description_width": "" | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"id": "NTf__OFI2lC2" | |
}, | |
"outputs": [], | |
"source": [ | |
"!pip install --quiet duckdb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install --quiet pyiceberg" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "W9e4XgUBwUuo", | |
"outputId": "29d6f42f-3ed6-4eda-aa1a-9a2b9eee0668" | |
}, | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m969.9/969.9 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.9/123.9 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25h" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install sqlalchemy" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "TlRyIpJdxOt1", | |
"outputId": "23ee8a73-9ec1-41ee-d078-9b88e8bbabae" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (2.0.31)\n", | |
"Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (4.12.2)\n", | |
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (3.0.3)\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import logging\n", | |
"logging.basicConfig(level=logging.DEBUG)" | |
], | |
"metadata": { | |
"id": "25ED4iUnMbqf" | |
}, | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import duckdb\n", | |
"\n", | |
"duckdb.sql(\"SELECT 1\").df\n" | |
], | |
"metadata": { | |
"id": "jgn93mtb3AS1", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "6f3a48be-7690-44c5-d7d2-cebf16fa0291" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<bound method PyCapsule.df of ┌───────┐\n", | |
"│ 1 │\n", | |
"│ int32 │\n", | |
"├───────┤\n", | |
"│ 1 │\n", | |
"└───────┘\n", | |
">" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "GNDv5_GnEqVo" | |
}, | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Create an Iceberg Catalog\n", | |
"Use sqlalchemy to create a sqlite catalog for iceberg. Create a table in iceberg and load the taxi dataset." | |
], | |
"metadata": { | |
"id": "nCGwS0I8K6Ko" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!mkdir \"/tmp/warehouse\" # make directory for catalog SqlLite file" | |
], | |
"metadata": { | |
"id": "vydzcwjiFOAY" | |
}, | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from pyiceberg.catalog.sql import SqlCatalog\n", | |
"\n", | |
"warehouse_path = \"/tmp/warehouse\"\n", | |
"catalog = SqlCatalog(\n", | |
" \"default\",\n", | |
" **{\n", | |
" \"uri\": f\"sqlite:///{warehouse_path}/pyiceberg_catalog.db\",\n", | |
" \"warehouse\": f\"file://{warehouse_path}\",\n", | |
" },\n", | |
")" | |
], | |
"metadata": { | |
"id": "HHJdYFAgFTof" | |
}, | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Load Data into a Table\n", | |
"Load taxi data into a iceberg table" | |
], | |
"metadata": { | |
"id": "r0XFuTkx6rcV" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"arrow_df = duckdb.sql(\"select * from read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')\").arrow()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49, | |
"referenced_widgets": [ | |
"1aba39ae35bc4fb2b88adc346e18568a", | |
"a8f093b1c6814577a24d56a2bee85a2d", | |
"f1670a9277de47b2a371d87bfdd2d2ca" | |
] | |
}, | |
"id": "aLwhWCs0GAuH", | |
"outputId": "bb575d4f-dfc8-4171-b71e-f8cabfe28d6a" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" | |
], | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "1aba39ae35bc4fb2b88adc346e18568a" | |
} | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"schema = arrow_df.schema\n", | |
"schema" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "97RLi8PIGIMB", | |
"outputId": "296a6798-d337-43f3-9bc3-e0a9eece40ae" | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"VendorID: int64\n", | |
"tpep_pickup_datetime: timestamp[us]\n", | |
"tpep_dropoff_datetime: timestamp[us]\n", | |
"passenger_count: double\n", | |
"trip_distance: double\n", | |
"RatecodeID: double\n", | |
"store_and_fwd_flag: string\n", | |
"PULocationID: int64\n", | |
"DOLocationID: int64\n", | |
"payment_type: int64\n", | |
"fare_amount: double\n", | |
"extra: double\n", | |
"mta_tax: double\n", | |
"tip_amount: double\n", | |
"tolls_amount: double\n", | |
"improvement_surcharge: double\n", | |
"total_amount: double\n", | |
"congestion_surcharge: double\n", | |
"airport_fee: double" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# create the iceberg table\n", | |
"\n", | |
"from pyiceberg.exceptions import NoSuchNamespaceError, NamespaceAlreadyExistsError, TableAlreadyExistsError\n", | |
"\n", | |
"# Define the table name and namespace\n", | |
"namespace = \"taxi_namespace\"\n", | |
"table_name = \"taxi_trips\"\n", | |
"\n", | |
"# Create the namespace if it doesn't exist\n", | |
"try:\n", | |
" catalog.create_namespace(namespace)\n", | |
"except NamespaceAlreadyExistsError:\n", | |
" print(\"Namespace already exists.\")\n", | |
" pass # Namespace already exists\n", | |
"\n", | |
"# Create or load the Iceberg table\n", | |
"try:\n", | |
" table = catalog.create_table(f\"{namespace}.{table_name}\", schema)\n", | |
"except TableAlreadyExistsError:\n", | |
" table = catalog.load_table(f\"{namespace}.{table_name}\")\n", | |
" print(\"Iceberg table already exists.\")" | |
], | |
"metadata": { | |
"id": "TW-4s9WlFgOM" | |
}, | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"table.overwrite(arrow_df) # the data frame to the iceberg dataset" | |
], | |
"metadata": { | |
"id": "_zra0MQUv2d5" | |
}, | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"len(table.scan().to_arrow())\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "grT3hGJVKIOf", | |
"outputId": "08a05884-6c1a-4337-e0c3-d307e936175c" | |
}, | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"3066766" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df=table.scan().to_pandas()\n" | |
], | |
"metadata": { | |
"id": "zb--g4VtKQJi" | |
}, | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [], | |
"metadata": { | |
"id": "1mb4GLeuLdtn" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"r=duckdb.register(\"duck_test\",df)\n" | |
], | |
"metadata": { | |
"id": "w0qsEHTILWJn" | |
}, | |
"execution_count": 14, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"duckdb.sql(\"select * from duck_test\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "A_d_VPJJKdds", | |
"outputId": "5c9b2533-ff3a-4b9f-b828-1841d4992c38" | |
}, | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"┌──────────┬──────────────────────┬──────────────────────┬───┬──────────────┬──────────────────────┬─────────────┐\n", | |
"│ VendorID │ tpep_pickup_datetime │ tpep_dropoff_datet… │ … │ total_amount │ congestion_surcharge │ airport_fee │\n", | |
"│ int64 │ timestamp │ timestamp │ │ double │ double │ double │\n", | |
"├──────────┼──────────────────────┼──────────────────────┼───┼──────────────┼──────────────────────┼─────────────┤\n", | |
"│ 2 │ 2023-01-01 00:32:10 │ 2023-01-01 00:40:36 │ … │ 14.3 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:55:08 │ 2023-01-01 01:01:27 │ … │ 16.9 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:25:04 │ 2023-01-01 00:37:49 │ … │ 34.9 │ 2.5 │ 0.0 │\n", | |
"│ 1 │ 2023-01-01 00:03:48 │ 2023-01-01 00:13:25 │ … │ 20.85 │ 0.0 │ 1.25 │\n", | |
"│ 2 │ 2023-01-01 00:10:29 │ 2023-01-01 00:21:19 │ … │ 19.68 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:50:34 │ 2023-01-01 01:02:52 │ … │ 27.8 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:09:22 │ 2023-01-01 00:19:49 │ … │ 20.52 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │ … │ 64.44 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:21:44 │ 2023-01-01 00:36:40 │ … │ 28.38 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 00:39:42 │ 2023-01-01 00:50:36 │ … │ 19.9 │ 2.5 │ 0.0 │\n", | |
"│ · │ · │ · │ · │ · │ · │ · │\n", | |
"│ · │ · │ · │ · │ · │ · │ · │\n", | |
"│ · │ · │ · │ · │ · │ · │ · │\n", | |
"│ 2 │ 2023-01-01 01:28:24 │ 2023-01-01 01:56:38 │ … │ 42.3 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:25:30 │ 2023-01-01 01:46:33 │ … │ 35.6 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:49:47 │ 2023-01-01 02:01:27 │ … │ 26.63 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:13:11 │ 2023-01-01 01:16:31 │ … │ 14.04 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:22:22 │ 2023-01-01 01:33:19 │ … │ 21.36 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:37:42 │ 2023-01-01 01:49:29 │ … │ 17.8 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:51:17 │ 2023-01-01 02:02:07 │ … │ 22.9 │ 2.5 │ 0.0 │\n", | |
"│ 2 │ 2023-01-01 01:49:24 │ 2023-01-01 02:17:26 │ … │ 33.2 │ 2.5 │ 0.0 │\n", | |
"│ 1 │ 2023-01-01 01:16:50 │ 2023-01-01 01:23:29 │ … │ 10.4 │ 0.0 │ 0.0 │\n", | |
"│ 1 │ 2023-01-01 01:26:48 │ 2023-01-01 01:35:56 │ … │ 13.9 │ 0.0 │ 0.0 │\n", | |
"├──────────┴──────────────────────┴──────────────────────┴───┴──────────────┴──────────────────────┴─────────────┤\n", | |
"│ ? rows (>9999 rows, 20 shown) 19 columns (6 shown) │\n", | |
"└────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 15 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"duckdb.sql(\"SELECT * FROM information_schema.tables ;\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "20EPIneOLE04", | |
"outputId": "e3dca695-2110-48a2-8825-7de6e0083157" | |
}, | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"┌───────────────┬──────────────┬────────────┬───┬────────────────────┬──────────┬───────────────┬───────────────┐\n", | |
"│ table_catalog │ table_schema │ table_name │ … │ is_insertable_into │ is_typed │ commit_action │ TABLE_COMMENT │\n", | |
"│ varchar │ varchar │ varchar │ │ varchar │ varchar │ varchar │ varchar │\n", | |
"├───────────────┼──────────────┼────────────┼───┼────────────────────┼──────────┼───────────────┼───────────────┤\n", | |
"│ temp │ main │ duck_test │ … │ NO │ NO │ NULL │ NULL │\n", | |
"├───────────────┴──────────────┴────────────┴───┴────────────────────┴──────────┴───────────────┴───────────────┤\n", | |
"│ 1 rows 13 columns (7 shown) │\n", | |
"└───────────────────────────────────────────────────────────────────────────────────────────────────────────────┘" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 16 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Use PyIceberg CLI to read\n", | |
"Use the pyiceberg cli to read describe the newly created table" | |
], | |
"metadata": { | |
"id": "mSlEj5GSP22M" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pyiceberg --help" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "toT0G8eQM6UX", | |
"outputId": "eb7c9566-a020-4789-ca39-0ad850a106a1" | |
}, | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Usage: pyiceberg [OPTIONS] COMMAND [ARGS]...\n", | |
"\n", | |
"Options:\n", | |
" --catalog TEXT\n", | |
" --verbose BOOLEAN\n", | |
" --output [text|json]\n", | |
" --uri TEXT\n", | |
" --credential TEXT\n", | |
" --help Show this message and exit.\n", | |
"\n", | |
"Commands:\n", | |
" create Operation to create a namespace.\n", | |
" describe Describe a namespace or a table.\n", | |
" drop Operations to drop a namespace or table.\n", | |
" files List all the files of the table.\n", | |
" list List tables or namespaces.\n", | |
" list-refs List all the refs in the provided table.\n", | |
" location Return the location of the table.\n", | |
" properties Properties on tables/namespaces.\n", | |
" rename Rename a table.\n", | |
" schema Get the schema of the table.\n", | |
" spec Return the partition spec of the table.\n", | |
" uuid Return the UUID of the table.\n", | |
" version Print pyiceberg version.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pyiceberg --catalog default --uri sqlite:////tmp/warehouse/pyiceberg_catalog.db list" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xCVzpKzZM_Vp", | |
"outputId": "a29cf4e9-6c5d-485b-e620-c0ed579c8646" | |
}, | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"taxi_namespace\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pyiceberg --catalog default --uri sqlite:////tmp/warehouse/pyiceberg_catalog.db describe taxi_namespace.taxi_trips" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xx4Vb1hINy9M", | |
"outputId": "faa5e585-cceb-4999-d843-9aa14e623d45" | |
}, | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Table format version 2 \n", | |
"Metadata location file:///tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/00001-86911ea9-b1…\n", | |
"Table UUID 14d1f1e3-c7d5-4099-ba01-db72e6d43773 \n", | |
"Last Updated 1721786211755 \n", | |
"Partition spec [] \n", | |
"Sort order [] \n", | |
"Current schema Schema, id=0 \n", | |
" ├── 1: VendorID: optional long \n", | |
" ├── 2: tpep_pickup_datetime: optional timestamp \n", | |
" ├── 3: tpep_dropoff_datetime: optional timestamp \n", | |
" ├── 4: passenger_count: optional double \n", | |
" ├── 5: trip_distance: optional double \n", | |
" ├── 6: RatecodeID: optional double \n", | |
" ├── 7: store_and_fwd_flag: optional string \n", | |
" ├── 8: PULocationID: optional long \n", | |
" ├── 9: DOLocationID: optional long \n", | |
" ├── 10: payment_type: optional long \n", | |
" ├── 11: fare_amount: optional double \n", | |
" ├── 12: extra: optional double \n", | |
" ├── 13: mta_tax: optional double \n", | |
" ├── 14: tip_amount: optional double \n", | |
" ├── 15: tolls_amount: optional double \n", | |
" ├── 16: improvement_surcharge: optional double \n", | |
" ├── 17: total_amount: optional double \n", | |
" ├── 18: congestion_surcharge: optional double \n", | |
" └── 19: airport_fee: optional double \n", | |
"Current snapshot Operation.APPEND: id=5984614148918088538, schema_id=0 \n", | |
"Snapshots Snapshots \n", | |
" └── Snapshot 5984614148918088538, schema 0: \n", | |
" file:///tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/snap-59846141…\n", | |
"Properties \n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "5LjvcWdWQQ5B" | |
}, | |
"execution_count": 19, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Read from the catalog using PyIceberg" | |
], | |
"metadata": { | |
"id": "RQRyWG3Ddnjm" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from pyiceberg.catalog import load_catalog\n", | |
"\n", | |
"read_catalog=load_catalog(\n", | |
" \"default\",\n", | |
" **{\n", | |
" \"uri\": \"sqlite:////tmp/warehouse/pyiceberg_catalog.db\",\n", | |
" \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n", | |
"\n", | |
" }\n", | |
")\n", | |
"\n", | |
"read_catalog.list_namespaces() # list all namespaces" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "hnz1aN_PQDZk", | |
"outputId": "62a1cb9b-a70b-4816-c3a9-611508d8ce87" | |
}, | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[('taxi_namespace',)]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 24 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"read_catalog.list_tables(\"taxi_namespace\") # list all tables in the namespace" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Mmwx7mvndr22", | |
"outputId": "60a00c90-82ce-46d4-806b-8d62258390ea" | |
}, | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[('taxi_namespace', 'taxi_trips')]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 25 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"\n", | |
"read_table = read_catalog.load_table(\"taxi_namespace.taxi_trips\")\n", | |
"read_table" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "yhg4yI6kd5fD", | |
"outputId": "4d52f8ff-c7ff-4283-b179-b704bdc3852b" | |
}, | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"taxi_trips(\n", | |
" 1: VendorID: optional long,\n", | |
" 2: tpep_pickup_datetime: optional timestamp,\n", | |
" 3: tpep_dropoff_datetime: optional timestamp,\n", | |
" 4: passenger_count: optional double,\n", | |
" 5: trip_distance: optional double,\n", | |
" 6: RatecodeID: optional double,\n", | |
" 7: store_and_fwd_flag: optional string,\n", | |
" 8: PULocationID: optional long,\n", | |
" 9: DOLocationID: optional long,\n", | |
" 10: payment_type: optional long,\n", | |
" 11: fare_amount: optional double,\n", | |
" 12: extra: optional double,\n", | |
" 13: mta_tax: optional double,\n", | |
" 14: tip_amount: optional double,\n", | |
" 15: tolls_amount: optional double,\n", | |
" 16: improvement_surcharge: optional double,\n", | |
" 17: total_amount: optional double,\n", | |
" 18: congestion_surcharge: optional double,\n", | |
" 19: airport_fee: optional double\n", | |
"),\n", | |
"partition by: [],\n", | |
"sort order: [],\n", | |
"snapshot: Operation.APPEND: id=5984614148918088538, schema_id=0" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 30 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Scan able to Pandas\n", | |
"Read data from the parquet files to a pandas data frame." | |
], | |
"metadata": { | |
"id": "TypJdWbxe5kg" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from pyiceberg.expressions import GreaterThanOrEqual\n", | |
"scan = table.scan(\n", | |
" row_filter=GreaterThanOrEqual(\"trip_distance\", 10.0),\n", | |
" selected_fields=(\"VendorID\", \"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"),\n", | |
" limit=100,\n", | |
")\n", | |
"\n", | |
"[task.file.file_path for task in scan.plan_files()] # show the file that contains the rows\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "fd5h-Gl7d-ti", | |
"outputId": "244a0df3-0d4f-4315-b19c-bd8eb8c337e8" | |
}, | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"<pyiceberg.table.DataScan object at 0x7b94e2cb2200>\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['file:///tmp/warehouse/taxi_namespace.db/taxi_trips/data/00000-0-4d92790a-a45b-4e41-807e-0299bbe7d87d.parquet']" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 42 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"\n", | |
"scan_df = scan.to_pandas() # read the data to a pandas data frame\n", | |
"scan_df" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 424 | |
}, | |
"id": "9Q30vYlNfTvS", | |
"outputId": "9ff898f1-5728-454d-9bc1-1c47904462b0" | |
}, | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" VendorID tpep_pickup_datetime tpep_dropoff_datetime\n", | |
"0 2 2023-01-01 00:27:12 2023-01-01 00:49:56\n", | |
"1 2 2023-01-01 00:09:29 2023-01-01 00:29:23\n", | |
"2 1 2023-01-01 00:13:30 2023-01-01 00:44:00\n", | |
"3 2 2023-01-01 00:41:41 2023-01-01 01:19:32\n", | |
"4 2 2023-01-01 00:22:39 2023-01-01 01:30:45\n", | |
".. ... ... ...\n", | |
"95 2 2023-01-01 00:56:24 2023-01-01 01:26:29\n", | |
"96 2 2023-01-01 00:55:38 2023-01-01 01:25:34\n", | |
"97 2 2023-01-01 00:13:36 2023-01-01 00:48:23\n", | |
"98 2 2023-01-01 00:51:18 2023-01-01 01:11:18\n", | |
"99 2 2023-01-01 00:27:34 2023-01-01 01:05:05\n", | |
"\n", | |
"[100 rows x 3 columns]" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-610138c9-cd6d-4c79-bb67-3ea11f3633df\" class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>VendorID</th>\n", | |
" <th>tpep_pickup_datetime</th>\n", | |
" <th>tpep_dropoff_datetime</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:27:12</td>\n", | |
" <td>2023-01-01 00:49:56</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:09:29</td>\n", | |
" <td>2023-01-01 00:29:23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>2023-01-01 00:13:30</td>\n", | |
" <td>2023-01-01 00:44:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:41:41</td>\n", | |
" <td>2023-01-01 01:19:32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:22:39</td>\n", | |
" <td>2023-01-01 01:30:45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:56:24</td>\n", | |
" <td>2023-01-01 01:26:29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:55:38</td>\n", | |
" <td>2023-01-01 01:25:34</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:13:36</td>\n", | |
" <td>2023-01-01 00:48:23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:51:18</td>\n", | |
" <td>2023-01-01 01:11:18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>2</td>\n", | |
" <td>2023-01-01 00:27:34</td>\n", | |
" <td>2023-01-01 01:05:05</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100 rows × 3 columns</p>\n", | |
"</div>\n", | |
" <div class=\"colab-df-buttons\">\n", | |
"\n", | |
" <div class=\"colab-df-container\">\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-610138c9-cd6d-4c79-bb67-3ea11f3633df')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
"\n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" .colab-df-buttons div {\n", | |
" margin-bottom: 4px;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-610138c9-cd6d-4c79-bb67-3ea11f3633df button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-610138c9-cd6d-4c79-bb67-3ea11f3633df');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
"\n", | |
"<div id=\"df-74f1ad6f-29a7-4225-b688-9cda9e5418a7\">\n", | |
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-74f1ad6f-29a7-4225-b688-9cda9e5418a7')\"\n", | |
" title=\"Suggest charts\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <g>\n", | |
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", | |
" </g>\n", | |
"</svg>\n", | |
" </button>\n", | |
"\n", | |
"<style>\n", | |
" .colab-df-quickchart {\n", | |
" --bg-color: #E8F0FE;\n", | |
" --fill-color: #1967D2;\n", | |
" --hover-bg-color: #E2EBFA;\n", | |
" --hover-fill-color: #174EA6;\n", | |
" --disabled-fill-color: #AAA;\n", | |
" --disabled-bg-color: #DDD;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-quickchart {\n", | |
" --bg-color: #3B4455;\n", | |
" --fill-color: #D2E3FC;\n", | |
" --hover-bg-color: #434B5C;\n", | |
" --hover-fill-color: #FFFFFF;\n", | |
" --disabled-bg-color: #3B4455;\n", | |
" --disabled-fill-color: #666;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart {\n", | |
" background-color: var(--bg-color);\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: var(--fill-color);\n", | |
" height: 32px;\n", | |
" padding: 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart:hover {\n", | |
" background-color: var(--hover-bg-color);\n", | |
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: var(--button-hover-fill-color);\n", | |
" }\n", | |
"\n", | |
" .colab-df-quickchart-complete:disabled,\n", | |
" .colab-df-quickchart-complete:disabled:hover {\n", | |
" background-color: var(--disabled-bg-color);\n", | |
" fill: var(--disabled-fill-color);\n", | |
" box-shadow: none;\n", | |
" }\n", | |
"\n", | |
" .colab-df-spinner {\n", | |
" border: 2px solid var(--fill-color);\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" animation:\n", | |
" spin 1s steps(1) infinite;\n", | |
" }\n", | |
"\n", | |
" @keyframes spin {\n", | |
" 0% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" border-left-color: var(--fill-color);\n", | |
" }\n", | |
" 20% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 30% {\n", | |
" border-color: transparent;\n", | |
" border-left-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 40% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-top-color: var(--fill-color);\n", | |
" }\n", | |
" 60% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" }\n", | |
" 80% {\n", | |
" border-color: transparent;\n", | |
" border-right-color: var(--fill-color);\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" 90% {\n", | |
" border-color: transparent;\n", | |
" border-bottom-color: var(--fill-color);\n", | |
" }\n", | |
" }\n", | |
"</style>\n", | |
"\n", | |
" <script>\n", | |
" async function quickchart(key) {\n", | |
" const quickchartButtonEl =\n", | |
" document.querySelector('#' + key + ' button');\n", | |
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", | |
" quickchartButtonEl.classList.add('colab-df-spinner');\n", | |
" try {\n", | |
" const charts = await google.colab.kernel.invokeFunction(\n", | |
" 'suggestCharts', [key], {});\n", | |
" } catch (error) {\n", | |
" console.error('Error during call to suggestCharts:', error);\n", | |
" }\n", | |
" quickchartButtonEl.classList.remove('colab-df-spinner');\n", | |
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", | |
" }\n", | |
" (() => {\n", | |
" let quickchartButtonEl =\n", | |
" document.querySelector('#df-74f1ad6f-29a7-4225-b688-9cda9e5418a7 button');\n", | |
" quickchartButtonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
" })();\n", | |
" </script>\n", | |
"</div>\n", | |
"\n", | |
" <div id=\"id_eb84e669-b39a-46a4-9e56-7196f5fccf21\">\n", | |
" <style>\n", | |
" .colab-df-generate {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-generate:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-generate {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-generate:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
" <button class=\"colab-df-generate\" onclick=\"generateWithVariable('scan_df')\"\n", | |
" title=\"Generate code using this dataframe.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" <script>\n", | |
" (() => {\n", | |
" const buttonEl =\n", | |
" document.querySelector('#id_eb84e669-b39a-46a4-9e56-7196f5fccf21 button.colab-df-generate');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" buttonEl.onclick = () => {\n", | |
" google.colab.notebook.generateWithVariable('scan_df');\n", | |
" }\n", | |
" })();\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
" </div>\n", | |
" </div>\n" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "dataframe", | |
"variable_name": "scan_df", | |
"summary": "{\n \"name\": \"scan_df\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"VendorID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tpep_pickup_datetime\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2023-01-01 00:01:55\",\n \"max\": \"2023-01-01 23:19:19\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"2023-01-01 00:50:48\",\n \"2023-01-01 00:13:55\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tpep_dropoff_datetime\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2023-01-01 00:27:47\",\n \"max\": \"2023-01-02 00:11:52\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"2023-01-01 00:40:09\",\n \"2023-01-01 00:35:19\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 43 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Scan into DuckDB\n", | |
"query the scanned rows using duckdb" | |
], | |
"metadata": { | |
"id": "SyumqklHfa9b" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"scan_duck = scan.to_duckdb(table_name=\"distant_taxi_trips\") # scan into duckdb\n", | |
"scan_duck.sql(\"select * from distant_taxi_trips\")\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "k79Ethzjfg5A", | |
"outputId": "e6ed444f-9265-4bec-b31e-2977cdbe405c" | |
}, | |
"execution_count": 49, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"┌──────────┬──────────────────────┬───────────────────────┐\n", | |
"│ VendorID │ tpep_pickup_datetime │ tpep_dropoff_datetime │\n", | |
"│ int64 │ timestamp │ timestamp │\n", | |
"├──────────┼──────────────────────┼───────────────────────┤\n", | |
"│ 2 │ 2023-01-01 00:27:12 │ 2023-01-01 00:49:56 │\n", | |
"│ 2 │ 2023-01-01 00:09:29 │ 2023-01-01 00:29:23 │\n", | |
"│ 1 │ 2023-01-01 00:13:30 │ 2023-01-01 00:44:00 │\n", | |
"│ 2 │ 2023-01-01 00:41:41 │ 2023-01-01 01:19:32 │\n", | |
"│ 2 │ 2023-01-01 00:22:39 │ 2023-01-01 01:30:45 │\n", | |
"│ 1 │ 2023-01-01 00:18:29 │ 2023-01-01 00:55:20 │\n", | |
"│ 2 │ 2023-01-01 00:25:14 │ 2023-01-01 01:00:29 │\n", | |
"│ 2 │ 2023-01-01 00:37:44 │ 2023-01-01 01:16:14 │\n", | |
"│ 2 │ 2023-01-01 00:49:24 │ 2023-01-01 01:30:59 │\n", | |
"│ 1 │ 2023-01-01 00:21:15 │ 2023-01-01 00:52:25 │\n", | |
"│ · │ · │ · │\n", | |
"│ · │ · │ · │\n", | |
"│ · │ · │ · │\n", | |
"│ 2 │ 2023-01-01 00:12:07 │ 2023-01-01 00:44:26 │\n", | |
"│ 2 │ 2023-01-01 00:47:21 │ 2023-01-01 01:30:32 │\n", | |
"│ 2 │ 2023-01-01 00:47:39 │ 2023-01-01 01:23:27 │\n", | |
"│ 1 │ 2023-01-01 00:56:13 │ 2023-01-01 01:20:11 │\n", | |
"│ 2 │ 2023-01-01 00:05:24 │ 2023-01-01 00:33:50 │\n", | |
"│ 2 │ 2023-01-01 00:56:24 │ 2023-01-01 01:26:29 │\n", | |
"│ 2 │ 2023-01-01 00:55:38 │ 2023-01-01 01:25:34 │\n", | |
"│ 2 │ 2023-01-01 00:13:36 │ 2023-01-01 00:48:23 │\n", | |
"│ 2 │ 2023-01-01 00:51:18 │ 2023-01-01 01:11:18 │\n", | |
"│ 2 │ 2023-01-01 00:27:34 │ 2023-01-01 01:05:05 │\n", | |
"├──────────┴──────────────────────┴───────────────────────┤\n", | |
"│ 100 rows (20 shown) 3 columns │\n", | |
"└─────────────────────────────────────────────────────────┘" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 49 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Query Iceberg Table\n", | |
"Use duckdb and the Iceberg extension to query iceberg" | |
], | |
"metadata": { | |
"id": "GpBp6vtB61v5" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"duckdb.sql(\"INSTALL iceberg;\")" | |
], | |
"metadata": { | |
"id": "WHTn0dl3F_6W", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49, | |
"referenced_widgets": [ | |
"a3feb983d94d4b6b9a6900e994387d90", | |
"19c3c4eae073475188884cc8421ac4e4", | |
"832a49bbebdc49ceaba1de536c218566" | |
] | |
}, | |
"outputId": "7dd1bac4-d683-47bb-b37b-ebf40875b758" | |
}, | |
"execution_count": 50, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" | |
], | |
"application/vnd.jupyter.widget-view+json": { | |
"version_major": 2, | |
"version_minor": 0, | |
"model_id": "a3feb983d94d4b6b9a6900e994387d90" | |
} | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"duckdb.sql(\"LOAD ICEBERG;\")" | |
], | |
"metadata": { | |
"id": "FS7JJjQfFxJw" | |
}, | |
"execution_count": 51, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"query=f\"\"\"\n", | |
" SELECT *\n", | |
" FROM iceberg_scan('{warehouse_path}/{namespace}.db/{table_name}')\n", | |
"\"\"\"\n", | |
"query" | |
], | |
"metadata": { | |
"id": "0lqme_aYQMXm", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "3b67ea09-6449-4981-8358-2e11030c8077" | |
}, | |
"execution_count": 52, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"\"\\n SELECT *\\n FROM iceberg_scan('/tmp/warehouse/taxi_namespace.db/taxi_trips')\\n\"" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 52 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"\n", | |
"# Query the Iceberg table\n", | |
"result = duckdb.sql(query)" | |
], | |
"metadata": { | |
"id": "x49btKTdK8hB", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 287 | |
}, | |
"outputId": "670cd7cc-7150-4bc8-ba54-de16ead5e45f" | |
}, | |
"execution_count": 53, | |
"outputs": [ | |
{ | |
"output_type": "error", | |
"ename": "IOException", | |
"evalue": "IO Error: Cannot open file \"/tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/version-hint.text\": No such file or directory", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mIOException\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-53-4ef425e70b2c>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Query the Iceberg table\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mduckdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/duckdb/__init__.py\u001b[0m in \u001b[0;36msql\u001b[0;34m(query, **kwargs)\u001b[0m\n\u001b[1;32m 455\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mduckdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\":default:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 457\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 458\u001b[0m \u001b[0m_exported_symbols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sql'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mIOException\u001b[0m: IO Error: Cannot open file \"/tmp/warehouse/taxi_namespace.db/taxi_trips/metadata/version-hint.text\": No such file or directory" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "9mm28__GGafk" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Detach the existing \"sqlite_db\" if it exists\n", | |
"duckdb.execute(\"DETACH DATABASE IF EXISTS sqlite_db;\")\n", | |
"\n", | |
"# Attach the SQLite database to DuckDB\n", | |
"duckdb.execute(\"INSTALL sqlite_scanner;\")\n", | |
"duckdb.execute(\"LOAD sqlite_scanner;\")\n", | |
"duckdb.execute(\"ATTACH DATABASE '/tmp/warehouse/pyiceberg_catalog.db' AS sqlite_db;\")\n", | |
"\n" | |
], | |
"metadata": { | |
"id": "C3JTQUQIGlvm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"duckdb.sql(\"select * from main.sqlite_master\")" | |
], | |
"metadata": { | |
"id": "Oir46FBqQl_O" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "yzUrhj0v0pV6" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls /tmp/" | |
], | |
"metadata": { | |
"id": "dkfCydJOG1Jz" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls /tmp/warehouse/" | |
], | |
"metadata": { | |
"id": "HmP2N2vbG5JQ" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!cat /tmp/warehouse/pyiceberg_catalog.db" | |
], | |
"metadata": { | |
"id": "16B7kStLG9P6" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls /tmp/warehouse/taxi_namespace.db" | |
], | |
"metadata": { | |
"id": "nD8wz38TII7E" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls /tmp/warehouse/taxi_namespace.db/taxi_trips" | |
], | |
"metadata": { | |
"id": "OwXHQg9MILdW" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls /tmp/warehouse/taxi_namespace.db/taxi_trips/metadata" | |
], | |
"metadata": { | |
"id": "_B9FlmDeIP3o" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [], | |
"metadata": { | |
"id": "j7Jj9-4na9Ld" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip freeze" | |
], | |
"metadata": { | |
"id": "pl9ylWoRJSue" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "oWO4asQXJTDG" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment