Skip to content

Instantly share code, notes, and snippets.

@pdet
Created May 25, 2023 11:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pdet/ebd201475581756c29e4533a8fa4106e to your computer and use it in GitHub Desktop.
Save pdet/ebd201475581756c29e4533a8fa4106e to your computer and use it in GitHub Desktop.
Compare DuckDB UDFs: Built-In UDFs vs. PyArrow UDFs
import duckdb
import pyarrow as pa
import pandas as pd
import time
import pyarrow.compute as pc
def time_function(function, *, runs=5):
    """Call *function* repeatedly and print its median wall-clock time.

    The callable is invoked ``runs`` times with no arguments.  The median of
    the measured durations (in seconds) is printed as ``Time: <seconds>``,
    followed by the value returned by the last invocation.

    Args:
        function: Zero-argument callable to benchmark.
        runs: Number of timed invocations; must be at least 1.  Defaults to
            5, the repetition count this helper originally hard-coded.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    timings = []
    ans = None
    for _ in range(runs):
        # perf_counter is the clock intended for measuring short durations.
        start_time = time.perf_counter()
        ans = function()
        timings.append(time.perf_counter() - start_time)

    timings.sort()
    # Middle element of the sorted samples: the median for an odd number of
    # runs (the upper median when ``runs`` is even).
    print(f"Time: {timings[runs // 2]}")
    print(ans)
def exec_native():
    """Run the aggregate benchmark query that goes through ``add_native``.

    Reads the module-level DuckDB connection ``con``; the ``add_native``
    function and the ``numbers`` relation must already be registered on it.

    Returns:
        The rows produced by ``fetchall()`` for the query.
    """
    # ``con`` is only read here, so no ``global`` declaration is required.
    return con.sql("select sum(add_native(i)) from numbers").fetchall()
def exec_arrow():
    """Run the aggregate benchmark query that goes through ``add_arrow``.

    Reads the module-level DuckDB connection ``con``; the ``add_arrow``
    function and the ``numbers`` relation must already be registered on it.

    Returns:
        The rows produced by ``fetchall()`` for the query.
    """
    # ``con`` is only read here, so no ``global`` declaration is required.
    return con.sql("select sum(add_arrow(i)) from numbers").fetchall()
def add_native(x):
    """Return ``x + 1``.

    Registered with DuckDB under the name ``add_native`` as a
    ``type='native'`` UDF taking and returning ``BIGINT``.
    """
    return x + 1
def add_arrow(x):
    """Return ``pc.add(x, 1)``, i.e. *x* incremented by one via PyArrow.

    Registered with DuckDB under the name ``add_arrow`` as a
    ``type='arrow'`` UDF taking and returning ``BIGINT``.

    NOTE(review): *x* is presumably a PyArrow array holding a batch of
    values rather than a single scalar -- confirm against the DuckDB docs.
    """
    return pc.add(x, 1)
# Shared in-process connection; exec_native/exec_arrow read it as a global.
con = duckdb.connect()
con.create_function('add_native', add_native, ['BIGINT'], 'BIGINT', type='native')
con.create_function('add_arrow', add_arrow, ['BIGINT'], 'BIGINT', type='arrow')

# Expose the rows produced by range(10000000) as the view "numbers" that
# every query below selects from.
con.sql("""
select
i
from range(10000000) tbl(i);
""").to_view("numbers")

# Sanity check: both UDF flavours must produce identical results before
# their timings are compared.  Raised explicitly (instead of a bare assert)
# so the check still runs under ``python -O``.
native_res = con.sql("select add_native(i) from numbers tbl(i)").fetchall()
arrow_res = con.sql("select add_arrow(i) from numbers tbl(i)").fetchall()
if native_res != arrow_res:
    raise AssertionError("add_native and add_arrow returned different results")

# Benchmark both UDF flavours on the same aggregate query.
print ("Built-In:")
time_function(exec_native)
print ("PyArrow:")
# time_function prints its own report and returns None, so it must not be
# wrapped in print() -- that would emit a stray "None" line.
time_function(exec_arrow)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment