Skip to content

Instantly share code, notes, and snippets.

@mneedham
Last active April 14, 2023 05:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mneedham/84ffbaa304198886acf747942533b12a to your computer and use it in GitHub Desktop.
Save mneedham/84ffbaa304198886acf747942533b12a to your computer and use it in GitHub Desktop.
SQL Aggregate vs Aggregate Window Functions

SQL Aggregate vs Aggregate Window Functions

Queries based on the Citi bikes dataset using DuckDB.

Associated YouTube video: https://www.youtube.com/watch?v=KTIBhsIoCvk

Load the data

CREATE OR REPLACE TABLE bikeStations AS
FROM read_csv_auto('data/*.csv', types={"station_id": "VARCHAR"}) 
WHERE missing_station_information = false;

ALTER TABLE bikeStations ADD COLUMN ts TIMESTAMP;

UPDATE bikeStations 
SET ts = to_timestamp(station_status_last_reported)::TIMESTAMP AT TIME ZONE 'US/Eastern';

Overview Query

.mode line
SELECT * FROM bikeStations LIMIT 1;

Normal aggregation

.mode duckbox
SELECT station_name, 
       date_trunc('DAY', ts) AS day,
       dayName(ts) AS weekDay,
       round(avg(num_bikes_available), 2) AS avg,
       round(min(num_bikes_available), 2) AS min,
       round(max(num_bikes_available), 2) AS max,
       round(stddev(num_bikes_available), 2) AS stddev
FROM bikeStations
WHERE station_id = 128
GROUP BY ALL
ORDER BY day
LIMIT 10;

// First compute the average availability for the day

SELECT station_name, date_trunc('DAY', ts) AS day,
       round(avg(num_bikes_available) OVER (
               PARTITION BY station_id, day
            ), 2) AS avg,
       ts,
       num_bikes_available AS available
FROM bikeStations
ORDER BY day, ts
LIMIT 10;

When are the most bikes available?

WITH avgByDay AS (
    SELECT station_id, station_name,
           date_trunc('DAY', ts) AS day,
           round(avg(num_bikes_available) OVER (
                   PARTITION BY station_id, day
                ), 2) AS avg,
           ts,
           num_bikes_available AS available,
    FROM bikeStations
    ORDER BY ts, day
)

SELECT ts, available, avg
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-06'
AND available > avg 
LIMIT 10;

When are the least bikes available?

WITH avgByDay AS (
    SELECT station_id, station_name, date_trunc('DAY', ts) AS day,
           ts,
           num_bikes_available AS available,
           round(avg(num_bikes_available) OVER (
                   PARTITION BY station_id, day
                ), 2) AS avg
    FROM bikeStations
    ORDER BY ts, day
)

SELECT ts, available, avg
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-06'
AND available < avg 
ORDER BY available
LIMIT 10;

Availability at 75%ile

WITH avgByDay AS (
    SELECT station_id, station_name, date_trunc('DAY', ts) AS day, ts, 
           num_bikes_available as available,
           round(avg(num_bikes_available) OVER station, 2) AS avg,
           round(min(num_bikes_available) OVER station, 2) AS min,
           round(max(num_bikes_available) OVER station, 2) AS max,
           quantile_cont(num_bikes_available, 0.75) OVER station AS q75
    FROM bikeStations
    WINDOW station AS (
        PARTITION BY station_id, day
    )
    ORDER BY ts, day
)

SELECT ts, available, avg, min, max, q75
FROM avgByDay
WHERE station_id = 128 AND day = '2019-08-07'
AND available > q75
LIMIT 10;

Across other stations

WITH avgByDay AS (
    SELECT station_id, station_name, date_trunc('DAY', ts) AS day, ts, 
           num_bikes_available as available,
           round(avg(num_bikes_available) OVER station, 2) AS avg,
           round(min(num_bikes_available) OVER station, 2) AS min,
           round(max(num_bikes_available) OVER station, 2) AS max,
           quantile_cont(num_bikes_available, 0.75) OVER station AS q75
    FROM bikeStations
    WINDOW station AS (
        PARTITION BY station_id, day
    )
    ORDER BY ts, day
)

SELECT station_name, strftime(ts, '%H:%M:%S') AS time, 
       available, avg, min, max, q75
FROM avgByDay
WHERE ts BETWEEN '2019-08-09 17:00:00' AND '2019-08-09 18:00:00'
AND available > q75
LIMIT 10;

Framing by row

WITH movingAverage AS (
    SELECT station_id, ts, num_bikes_available AS available,
           round(avg(num_bikes_available) OVER rolling, 2) AS avg,
           lag(num_bikes_available, 1) OVER rolling AS "n-1",
           lag(num_bikes_available, 2) OVER rolling AS "n-2"           
    FROM bikeStations
    WINDOW rolling AS (        
        PARTITION BY station_id
        ORDER BY ts
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW            
    )
    ORDER BY ts 
)

SELECT ts, avg, "n-2", "n-1", available AS current
FROM movingAverage
WHERE station_id = 128 AND date_trunc('DAY', ts) = '2019-08-06'
ORDER BY ts
LIMIT 10;

Framing by range

WITH movingAverage AS (
    SELECT station_id, ts, num_bikes_available AS available,
           round(avg(num_bikes_available) OVER rolling, 2) AS avg,
           round(min(num_bikes_available) OVER rolling, 2) AS min,
           round(max(num_bikes_available) OVER rolling, 2) AS max
    FROM bikeStations
    WINDOW rolling AS (        
        PARTITION BY station_id
        ORDER BY ts
        RANGE BETWEEN INTERVAL 10 MINUTES PRECEDING AND CURRENT ROW     
    )
    ORDER BY ts 
)

SELECT ts, available, avg, min, max
FROM movingAverage
WHERE station_id = 128 AND date_trunc('DAY', ts) = '2019-08-06'
ORDER BY ts
LIMIT 10;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment