Skip to content

Instantly share code, notes, and snippets.

@pdet
Created May 25, 2023 13:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pdet/2907290725539d390df7981e799ed593 to your computer and use it in GitHub Desktop.
Save pdet/2907290725539d390df7981e799ed593 to your computer and use it in GitHub Desktop.
Compare UDFs and External Function Execution
import duckdb
import pyarrow as pa
import pandas as pd
import time
def time_function(function):
res = []
for i in range (0,5):
start_time = time.monotonic()
ans = function()
end_time = time.monotonic()
res.append(end_time-start_time)
res.sort()
print ("Time: " + str(res[2]))
print (ans)
def string_length_arrow(x):
tuples = len(x)
values = [len(i.as_py()) if i.as_py() != None else 0 for i in x]
array = pa.array(values, type=pa.int32(), size=tuples)
return array
def exec_internal():
global con
return con.sql("select sum(strlen_arrow(i)) from strings tbl(i)").fetchall()
def exec_external():
global con
arrow_table = con.sql("select i from strings tbl(i)").arrow()
arrow_column = arrow_table['i']
tuples = len(arrow_column)
values = [len(i.as_py()) if i.as_py() != None else 0 for i in arrow_column]
array = pa.array(values, type=pa.int32(), size=tuples)
arrow_tbl = pa.Table.from_arrays([array], names=['i'])
return con.sql("select sum(i) from arrow_tbl").fetchall()
con = duckdb.connect()
con.create_function('strlen_arrow', string_length_arrow, ['VARCHAR'], int, type='arrow')
con.sql("""
select
case when i != 0 and i % 42 = 0
then
NULL
else
repeat(chr((65 + (i % 26))::INTEGER), (4 + (i % 12))) end
from range(10000000) tbl(i);
""").to_view("strings")
print ("Internal:")
time_function(exec_internal)
print ("External:")
time_function(exec_external)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment