How to use DuckDB in Databricks to process data stored in Databricks Unity Catalog. https://www.codecentric.de/wissens-hub/blog/access-databricks-unitycatalog-from-duckdb
# Databricks notebook source
# MAGIC %md
# MAGIC # Use DuckDB to Read data in a Databricks Unity Catalog
# MAGIC
# MAGIC Motivation: Databricks is a great platform when it comes to data management, governance, etc. (mostly thanks to Unity Catalog). But Spark as an engine is just okay-ish, especially when the data is not really big. Newer engines like Polars or DuckDB are better suited for this.
# MAGIC
# MAGIC Goal: Process data that is stored and managed in Unity Catalog with DuckDB.
# MAGIC
# MAGIC Ideas:
# MAGIC
# MAGIC 1. the obvious: read the data with Spark, convert it to Arrow, query it with DuckDB
# MAGIC 2. the direct: read the data directly with DuckDB, using temporary table credentials provided by Databricks
# MAGIC
# COMMAND ----------
# MAGIC %md
# MAGIC ## Prep
# MAGIC Install dependencies:
# MAGIC - the latest databricks-sdk, which includes the methods to retrieve temporary credential tokens
# MAGIC - duckdb
# MAGIC - the Azure SDK to read files directly (optional, not installed below)
# COMMAND ----------
# MAGIC %pip install --upgrade databricks-sdk duckdb
# COMMAND ----------
dbutils.library.restartPython()
# COMMAND ----------
import pyarrow as pa
import duckdb
from pyspark.sql import DataFrame
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.catalog import TableOperation
import pprint
import re
# COMMAND ----------
# MAGIC %md
# MAGIC ## Read data with Spark
# MAGIC Read the data with Spark, convert to Arrow.
# MAGIC
# MAGIC This works without Unity Catalog, but the data must fit into the memory of the driver node.
# COMMAND ----------
# read the data with spark
spark_df = spark.read.table("samples.nyctaxi.trips")
# register as a view for the Spark queries below
spark_df.createOrReplaceTempView("trips")
# create an arrow table from the spark dataframe that can be queried using duckdb;
# the name of the python variable is the table name in duckdb (duckdb's replacement
# scan resolves table names against arrow objects in the local scope)
trips = spark_df.toArrow()
# COMMAND ----------
# MAGIC %md
# MAGIC toArrow() is a new method added in Spark 4.
# MAGIC Spark 4 is not yet released, but Databricks regularly adds new (unreleased) features from the open-source version to their runtime.
# MAGIC
# MAGIC The method works like toPandas(), but instead of returning a pandas DataFrame, it returns an Arrow Table. The same limitations apply: as with toPandas(), all data is loaded into the memory of the driver node. Therefore this will not work for datasets larger than memory.
# MAGIC That said: Arrow is a bit more efficient than pandas in terms of memory usage, so it might work for slightly larger datasets than pandas would. The conversion will also definitely be faster.
# MAGIC
# MAGIC Without toArrow(), you would have to do:
# MAGIC
# MAGIC `pa.Table.from_batches(spark_df._collect_as_arrow())`
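# MAGIC
# MAGIC The next cell is a minimal sketch of this fallback, reusing `spark_df` from above. Note that `_collect_as_arrow()` is an internal PySpark method, so it may change between versions.
# COMMAND ----------
# fallback for runtimes without toArrow(): collect the DataFrame as Arrow record batches
# and assemble them into an Arrow table (all data still ends up on the driver)
trips_via_batches = pa.Table.from_batches(spark_df._collect_as_arrow())
# the resulting Arrow table can be queried with duckdb just like `trips` above
duckdb.sql("SELECT COUNT(*) FROM trips_via_batches")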
# COMMAND ----------
# MAGIC %md
# MAGIC ### Count
# COMMAND ----------
duckdb.sql("SELECT COUNT (*) FROM trips")
# COMMAND ----------
display(
    spark.sql("SELECT COUNT(*) FROM trips")
)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Some Groups/Aggregations
# MAGIC Average trip distance and fare amount by pickup zip code
# COMMAND ----------
sql = """
SELECT pickup_zip, avg(trip_distance), avg(fare_amount) FROM trips GROUP BY pickup_zip ORDER BY pickup_zip LIMIT 10
"""
# COMMAND ----------
duckdb.sql(sql)
# COMMAND ----------
display(
    spark.sql(sql)
)
# COMMAND ----------
# MAGIC %md
# MAGIC ### From Arrow/Duckdb back to Spark DataFrame
# COMMAND ----------
# MAGIC %md
# MAGIC In Spark 4, createDataFrame() accepts an Arrow Table as an argument.
# MAGIC The feature is available from Databricks Runtime 16.x and newer.
# MAGIC For older runtimes, duckdb can also convert the result to pandas (slower).
# COMMAND ----------
# using pandas as intermediate
result = duckdb.sql(sql).df() # df creates a pandas dataframe
display(
    spark.createDataFrame(result)
)
# COMMAND ----------
# using arrow as intermediate
# needs Databricks Runtime 16 or newer
result = duckdb.sql(sql).arrow()
display(
    spark.createDataFrame(result)
)
# COMMAND ----------
# MAGIC %md
# MAGIC # Read data directly from Unity Catalog
# COMMAND ----------
# MAGIC %md
# MAGIC Instead of using Spark to read from Unity Catalog, we query the underlying data directly with duckdb.
# MAGIC
# MAGIC This also works outside of Databricks.
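# MAGIC
# MAGIC Outside of Databricks there is no `spark.conf` to read the workspace URL from, so the `WorkspaceClient` has to be configured explicitly. A minimal sketch is below; the environment variable names are the ones the databricks-sdk reads by default, and any other authentication method the SDK supports works as well.
# COMMAND ----------
# when running outside Databricks, configure the workspace client explicitly,
# e.g. with a workspace URL and a personal access token; left commented out here,
# because inside Databricks the client is created from spark.conf in the next section
#
# import os
# w = WorkspaceClient(
#     host=os.environ["DATABRICKS_HOST"],
#     token=os.environ["DATABRICKS_TOKEN"],
# )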
# COMMAND ----------
# MAGIC %md
# MAGIC ## Get the temporary credential
# COMMAND ----------
w = WorkspaceClient(
    host=spark.conf.get("spark.databricks.workspaceUrl")
)


def get_temporary_credentials_for_table(table: str):
    table_id = w.tables.get(table).table_id
    return w.temporary_table_credentials.generate_temporary_table_credentials(
        table_id=table_id, operation=TableOperation.READ
    )
# COMMAND ----------
cred = get_temporary_credentials_for_table("samples.nyctaxi.trips")
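# COMMAND ----------
# MAGIC %md
# MAGIC A quick look at what the credential response contains (a sketch; the exact fields depend on the cloud the table lives in, on Azure it includes the table's storage url and a short-lived SAS token):
# COMMAND ----------
# inspect the temporary credential; note that this prints the (short-lived) SAS token
pprint.pprint(cred.as_dict())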
# COMMAND ----------
# MAGIC %md
# MAGIC ## Store the credential in duckdb
# COMMAND ----------
# extract the storage account name from the table's abfss:// url
storage_account_name = re.search(r'@(.*)\.dfs\.', cred.url).group(1)
sql = f"""
SET azure_transport_option_type = 'curl';
CREATE OR REPLACE SECRET (
    TYPE AZURE,
    CONNECTION_STRING 'AccountName={storage_account_name};SharedAccessSignature={cred.azure_user_delegation_sas.sas_token}'
);
"""
duckdb.sql(sql)
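# COMMAND ----------
# MAGIC %md
# MAGIC Optionally, verify that the secret was registered. `duckdb_secrets()` lists the secrets known to the current duckdb session, with the secret value redacted.
# COMMAND ----------
duckdb.sql("SELECT name, type, scope FROM duckdb_secrets()").show()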
# COMMAND ----------
# MAGIC %md
# MAGIC ## Read from duckdb
# COMMAND ----------
sql = f"""
SELECT * FROM delta_scan("{cred.url}") LIMIT 10;
"""
duckdb.sql(sql).show()
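# COMMAND ----------
# MAGIC %md
# MAGIC The same aggregation as in the Spark-based part above, but now running entirely in duckdb against the Delta files in cloud storage, without Spark in the loop:
# COMMAND ----------
sql = f"""
SELECT pickup_zip, avg(trip_distance), avg(fare_amount)
FROM delta_scan("{cred.url}")
GROUP BY pickup_zip
ORDER BY pickup_zip
LIMIT 10;
"""
duckdb.sql(sql).show()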