Queries based on the Citi bikes dataset using DuckDB.
Associated YouTube video: https://youtu.be/u9AtW5P0m5c
CREATE OR REPLACE TABLE bikeStations AS
FROM read_csv_auto('data/*.csv', types={"station_id": "VARCHAR"})
WHERE missing_station_information = false;
ALTER TABLE bikeStations ADD COLUMN ts TIMESTAMP;
UPDATE bikeStations
SET ts = to_timestamp(station_status_last_reported)::TIMESTAMP AT TIME ZONE 'US/Eastern';
.mode line
SELECT * FROM bikeStations LIMIT 1;
.mode duckbox
SELECT station_id, ts, row_number() OVER () AS rn
FROM bikeStations
LIMIT 10;
SELECT station_id, ts, row_number() OVER (ORDER BY ts) AS rn
FROM bikeStations
LIMIT 10;
SELECT station_id, ts, row_number() OVER () AS rn
FROM bikeStations
ORDER BY ts
LIMIT 10;
SELECT station_id, station_name, ts, num_bikes_available AS available,
row_number() OVER (PARTITION BY station_id ORDER BY ts) AS rn
FROM bikeStations
LIMIT 10;
SELECT station_id, station_name, ts, num_bikes_available AS available,
row_number() OVER (PARTITION BY station_id ORDER BY ts) AS rn
FROM bikeStations
WHERE rn = 1
LIMIT 10;
WITH bikeStationsRn AS (
SELECT station_id, station_name, ts, num_bikes_available AS available,
row_number() OVER (PARTITION BY station_id ORDER BY ts) AS rn
FROM bikeStations
)
SELECT *
FROM bikeStationsRn
WHERE rn = 1
LIMIT 10;
SELECT station_id AS id, station_name, ts,
num_bikes_available AS current,
LEAD(num_bikes_available) OVER (PARTITION BY station_id ORDER BY ts) AS next,
next - current AS delta
FROM bikeStations
ORDER BY station_id, ts
LIMIT 10;
SELECT station_id AS id, station_name, ts,
LAG(num_bikes_available) OVER (PARTITION BY station_id ORDER BY ts) AS prev,
num_bikes_available AS current,
current - prev AS delta
FROM bikeStations
ORDER BY station_id, ts
LIMIT 10;
The WINDOW clause can be used to define a named window that can be shared between multiple window functions:
SELECT station_id AS id, station_name, ts,
LAG(num_bikes_available) OVER station AS prev,
num_bikes_available AS current,
LEAD(num_bikes_available) OVER station AS next,
FROM bikeStations
WINDOW station AS (
PARTITION BY station_id
ORDER BY ts
)
ORDER BY station_id, ts
LIMIT 10;