Skip to content

Instantly share code, notes, and snippets.

@samukweku
Last active November 30, 2022 20:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samukweku/c2a83fb37cd8d43e3f0e7742db8d653b to your computer and use it in GitHub Desktop.
Save samukweku/c2a83fb37cd8d43e3f0e7742db8d653b to your computer and use it in GitHub Desktop.
Code for pydata global conditional_join talk
# data is from DuckDB's github repo
# https://github.com/duckdb/duckdb/tree/master/benchmark/micro/join
import pandas as pd; import numpy as np; import janitor as jn; import duckdb
query = """SELECT SETSEED(0.8675309);
CREATE TABLE events AS (
SELECT *,
"start" + INTERVAL (CASE WHEN random() < 0.1 THEN 120 ELSE (5 + round(random() * 50, 0)::BIGINT) END) MINUTE
AS "end"
FROM (
SELECT id,
'Event ' || id::VARCHAR as "name",
(5 + round(random() * 5000, 0)::BIGINT) AS audience,
'1992-01-01'::TIMESTAMP
+ INTERVAL (round(random() * 40 * 365, 0)::BIGINT) DAY
+ INTERVAL (round(random() * 23, 0)::BIGINT) HOUR
AS "start",
'Sponsor ' || (1 + round(random() * 10, 0)::BIGINT) AS sponsor
FROM range(1, 30000) tbl(id)
) q
);"""
con = duckdb.connect()
con.execute(query) # create table
# duckdb computation
event_count = """SELECT COUNT(*) FROM (
SELECT r.id, s.id
FROM events r, events s
WHERE r.start <= s.end AND r.end >= s.start
AND r.id <> s.id
) q2;"""
%timeit con.execute(event_count)
# create Pandas dataframe and run conditional_join
events = con.execute("select * from events").df()
%%timeit
(events
.conditional_join(
events,
("start", "end", "<="),
("end", "start", ">="),
("id", "id", "!="),
use_numba = False)
)
%%timeit
(events
.conditional_join(
events,
("start", "end", "<="),
("end", "start", ">="),
("id", "id", "!="),
use_numba = True)
)
##### Tax audits
query = """CREATE TYPE surname_t AS ENUM (
'Smith',
'Johnson',
'Williams',
'Jones',
'Brown',
'Davis',
'Miller',
'Wilson',
'Moore',
'Taylor',
'Anderson',
'Thomas',
'Jackson',
'White',
'Harris',
'Martin',
'Thompson',
'Garcia',
'Martinez',
'Robinson'
);
SELECT SETSEED(0.8675309);
CREATE TABLE employees AS
SELECT
facts.id AS id,
surname AS "name",
dept,
salary,
(salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax
FROM (
SELECT
id,
enum_range(NULL::surname_t)[(round(random() * 19))::INTEGER] AS surname,
round(random() * 5)::INTEGER AS dept,
100 * id AS salary
FROM (SELECT UNNEST(range(1, 10000000))) tbl(id)
) facts
;"""
con = duckdb.connect()
con.execute(query)
counts = """SELECT COUNT(*) FROM (
SELECT r.id, s.id
FROM employees r, employees s
WHERE r.salary < s.salary AND r.tax > s.tax
) q1;"""
%timeit con.execute(counts)
# creata Pandas dataframe
employees = con.execute("select * from employees").df()
%%timeit
(employees
.conditional_join(
employees,
('salary', 'salary', '<'),
('tax', 'tax', '>'),
use_numba = True)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment