Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
-- Window Function examples
-- PostgreSQL conference South Africa 2018
-- By Willem Booysen
-- Youtube: https://www.youtube.com/watch?v=blHEnrYwySE
-- Create database and templates for demo
DROP DATABASE IF EXISTS WindowFunctions;
CREATE DATABASE WindowFunctions;
-- Create table Payroll
DROP TABLE IF EXISTS Payroll CASCADE;
CREATE TABLE Payroll (
emp_no INTEGER,
emp_name VARCHAR(20) NOT NULL,
dept_name VARCHAR(15) NOT NULL,
salary_amt DECIMAL (8,2) NOT NULL CHECK (salary_amt > 0.00)
);
-- Populate table Payroll
INSERT INTO Payroll (emp_no, emp_name, dept_name, salary_amt ) VALUES
(1, 'Mark Stone', 'Accounting', 16000.00),
(2, 'Maria Stone', 'Accounting', 13000.00),
(3, 'Geetha Singh', 'Accounting', 13000.00),
(4, 'Richard Hathaway', 'Accounting', 14000.00),
(5, 'Joseph Bastion', 'Accounting', 14000.00),
(6, 'Arthur Prince', 'Production',12000.00),
(7, 'Adele Morse', 'Production', 13000.00),
(8, 'Sheamus O Kelly', 'Production', 24000.00),
(9, 'Sheilah Flask', 'Production', 24000.00),
(10, 'Brian James', 'Production', 16000.00),
(11, 'Adam Scott', 'Production', 16000.00),
(12, 'Maurice Moss', 'IT',12000.00),
(13, 'Roy', 'IT', 12001.00),
(14, 'Jen Barber', 'IT', 28000.00),
(15, 'Richard Hammond', 'IT', 10000.00),
(16, 'James May', 'IT', 10000.00),
(18, 'Jeremy Clarkson', 'IT', 10000.00),
(17, 'John Doe', 'IT', 100000.00) --Note how emp_no was switched around here...
;
SELECT * FROM Payroll;
-- Examples: First the traditional method
-- Basic aggregate functions, using the department as the Group
SELECT
dept_name,
COUNT(*) AS dept_employee_count,
MIN(salary_amt) AS min_dept_salary,
MAX(salary_amt) AS max_dept_salary,
AVG(salary_amt)::DECIMAL(8,2) AS average_dept_salary,
SUM(salary_amt) AS total_dept_salaries
FROM Payroll
GROUP BY dept_name
ORDER BY dept_name
;
-- The individual rows have been grouped into subsets and returned in Department Groups as single rows.
-- If we wanted this information to be compared to each salaried employee, we would normally JOIN with a CTE
WITH Dept_stats AS (
SELECT
dept_name,
COUNT(*) AS dept_employee_count,
MIN(salary_amt) AS min_dept_salary,
MAX(salary_amt) AS max_dept_salary,
AVG(salary_amt)::DECIMAL(8,2) AS average_dept_salary,
SUM(salary_amt) AS total_dept_salaries
FROM Payroll
GROUP BY dept_name
ORDER BY dept_name
)
SELECT
Payroll.*,
(Select count(*) from Payroll) AS total_employee_count,
Dept_stats.dept_employee_count,
Dept_stats.min_dept_salary,
Dept_stats.max_dept_salary,
Dept_stats.average_dept_salary,
Dept_stats.total_dept_salaries
FROM Payroll
LEFT OUTER JOIN Dept_stats ON (Payroll.dept_name = Dept_stats.dept_name)
ORDER BY Payroll.dept_name, emp_name
;
-- Window Function --> How clean is this in comparison?
SELECT
*,
COUNT(*) OVER () AS total_employee_count,
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count,
MIN(salary_amt) OVER (PARTITION BY dept_name) AS min_dept_salary,
MAX(salary_amt) OVER (PARTITION BY dept_name) AS max_dept_salary,
AVG(salary_amt) OVER (PARTITION BY dept_name)::DECIMAL(8,2) AS avg_dept_sal,
SUM(salary_amt) OVER (PARTITION BY dept_name) AS total_dept_salaries
FROM Payroll
ORDER BY dept_name, emp_name;
-- DEMO 1 complete
-- Basic Syntax:
-- A window function call always contains an OVER clause.
-- "( )" --> This is the Window
SELECT
*,
COUNT(*) OVER () AS total_employee_count,
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count
FROM Payroll
;
SELECT
*,
COUNT(*) OVER () AS total_employee_count,
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count
FROM Payroll
WHERE dept_name = 'IT' --> Windows into your data are affected by the WHERE clause. Windows cannot work outside your base data set.
;
SELECT
*,
COUNT(*) OVER () AS total_employee_count,
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count,
MIN(salary_amt) OVER (PARTITION BY dept_name) AS min_dept_salary,
MAX(salary_amt) OVER (PARTITION BY dept_name) AS max_dept_salary,
AVG(salary_amt) OVER (PARTITION BY dept_name)::DECIMAL(8,2) AS avg_dept_sal,
SUM(salary_amt) OVER (PARTITION BY dept_name) AS total_dept_salaries
FROM Payroll
ORDER BY dept_name, emp_name;
-- And now for some handy comparison figures...
SELECT
*,
(salary_amt / (SUM(salary_amt) OVER (PARTITION BY dept_name)) * 100)::DECIMAL(18,2) AS Dept_Percentage,
(salary_amt / (SUM(salary_amt) OVER () ) * 100)::DECIMAL(18,2) AS Company_Percentage
FROM Payroll
ORDER BY dept_name, dept_percentage;
-- ORDER BY examples here --> Skipping to ROW_NUMBER for the moment as this best illustrates the various Window orders.
-- and how a Window order is not affected by the outer SELECT order.
SELECT
*,
-- Note the difference between row_number and the emp_no in terms of when it was captured.
ROW_NUMBER() OVER () AS "Base Row No",
-- No Partition, just an ORDER BY, thus the whole base resultset is used.
ROW_NUMBER() OVER (ORDER BY salary_amt) AS "Salary Row No",
-- Order each partition first, then assign row numbers.
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt) AS "Dept,Salary Row No"
FROM Payroll
ORDER BY emp_no;
-- DEMO 2 complete
-- Row_Order, Rank & Dense_rank:
-- The natural order of the table as we declared it.
SELECT *
FROM Payroll
ORDER BY emp_no;
-- ROW_NUMBER a really good way to explain the relationship between Windows and Partitions:
SELECT *,
-- Row_Number assigns a unique integer to each row within your partition within your window.
ROW_NUMBER() OVER (), -- Note the difference between row_number and the emp_no in terms of when it was captured.
ROW_NUMBER() OVER (ORDER BY salary_amt), -- No Partition, just an ORDER BY, thus the whole base resultset is used.
ROW_NUMBER() OVER (PARTITION BY dept_name),
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt),
-- Ranks --> Equal values are ranked the same, creating gaps in numbering
RANK() OVER (), -- Ranks are useless without an ORDER BY
RANK() OVER (PARTITION BY dept_name),
RANK() OVER (ORDER BY salary_amt),
RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt),
-- Dense_Ranks --> Equal values are ranked the same, without gaps in numbering
DENSE_RANK() OVER (),
DENSE_RANK() OVER (PARTITION BY dept_name),
DENSE_RANK() OVER (ORDER BY salary_amt),
DENSE_RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt)
FROM Payroll
ORDER BY emp_no
;
-- GREAT! Let's find the top 2 earners in every department
-- =======================================================
-- Solution with ROW_NUMBERS
WITH ctePayroll AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_row
FROM Payroll
--WHERE ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) <= 2 --> WFs not allowed in WHERE clauses
)
SELECT *
FROM ctePayroll
WHERE dept_row <= 2
; --> BUT hold on! What about people who earn the same salaries?
-- Solution with RANK
WITH ctePayroll AS (
SELECT *,
RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_rank
FROM Payroll
)
SELECT *
FROM ctePayroll
WHERE dept_rank <= 2
; --> HOWEVER, Production is just plain wrong???
-- Solution with DENSE_RANK
WITH ctePayroll AS (
SELECT *,
DENSE_RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_rank
FROM Payroll
)
SELECT *
FROM ctePayroll
WHERE dept_rank <= 2
--ORDER BY dept_rank --> Note how we can order based on CTE name, cannot do this without CTE
;
-- DEMO 3 complete
-- percent_rank and cume_rank
WITH base_data AS (
SELECT generate_series(1,10) AS series
)
SELECT *,
(PERCENT_RANK() OVER (ORDER BY series))::DEC(8,2), -- to determine the relative standing of a value within a set or rows
(CUME_DIST() OVER (ORDER BY series))::DEC(8,2), -- Perfect for a 10-90 split in data
NTILE(2) OVER (ORDER BY series), -- Split your dataset into x parts
NTILE(3) OVER (ORDER BY series)
FROM base_data
ORDER BY series
;
--ntile with partitions
SELECT *,
--ntile(bucket size)
ntile(2) OVER (PARTITION BY dept_name ORDER BY salary_amt),
ntile(3) OVER (PARTITION BY dept_name ORDER BY salary_amt)
FROM Payroll
ORDER BY dept_name, salary_amt;
-- DEMO 4 complete
-- LAG(), LEAD() --> Offset from current row
-- Let's return the employee names before and after the current row:
SELECT emp_no,
dept_name,
emp_name,
LAG(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Employee",
LEAD(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Next Employee"
FROM Payroll
ORDER BY emp_no;
-- The syntax of LAG and LEAD
-- LAG/LEAD (column or expression [,offset] [,default])
SELECT *,
LAG(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Employee",
LAG(emp_name, 2) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Offset 2",
LAG(emp_name, 2, 'nada...') OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Offset 2 with Defaults"
FROM Payroll
ORDER BY emp_no;
-- FIRST_VALUE(), LAST_VALUE() ... Offset relative to beginning/end of the window frame
SELECT *,
FIRST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Higest Earner",
MAX(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Highest Salary",
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Lowest Earner", --> Notice how LAST and MIN move with Window Frame
MIN(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Lowest Salary",
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name) AS "Lowest Earner for real",
MIN(salary_amt) OVER (PARTITION BY dept_name) "Lowest Salary for real"
FROM Payroll
ORDER BY dept_name, salary_amt DESC
;
-- Now the previous one failed miserably, so why the Highest Earner worked, but not the Lowest?
-- Because the ORDER BY initiates the Roller Blind effect, thus MIN/MAX/FIRST/LAST is calculated as the Window Frame grows.
-- Demo Complete
-- UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING
-- and.. RANGE vs ROWS
SELECT *,
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "ORDER BY - DEFAULT BEHAVIOUR",
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Lowest Earner - RANGE to Current Row", --> So ORDER BY default is RANGE
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Lowest Earner - ROWS to Current Row", --> Overriding default behaviour here
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS "Lowest Earner" --> Window functions are easy, there's almost no syntax or variations to remember...
FROM Payroll
;
-- The catch-all phrase "ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
SELECT *,
FIRST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "Higest Earner",
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS "Lowest Earner"
FROM Payroll
;
-- To clearly illustrate the difference between ROWS and RANGE:
SELECT *,
SUM(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Running Totals with RANGE",
SUM(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Running Totals with ROWS" --> BTW This is how we do running totals easily.
FROM Payroll
;
-- Getting really specific about what rows to include...
SELECT *,
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) AS "The rest",
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS "Prev, Current & Next",
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC
ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING) AS "Next 2"
FROM Payroll
;
-- Demo complete
/*
And now for Running Totals...
*/
DROP TABLE IF EXISTS running_totals;
CREATE TABLE running_totals AS (
SELECT generate_series(1,5000) AS my_id,
(random()* 100)::decimal(15,2) AS Amount
)
;
-- Let's take a look at our data
SELECT * FROM running_totals;
SELECT count(*), sum(amount)::money FROM running_totals;
-- Traditional method
SELECT t1.my_id,
t1.amount,
SUM(t2.amount) AS running_total
FROM running_totals t1
INNER JOIN running_totals t2 ON t1.my_id >= t2.my_id
GROUP BY t1.my_id, t1.amount
ORDER BY t1.my_id;
-- Window Functions
SELECT *,
SUM(amount) OVER (ORDER BY my_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
FROM running_totals
ORDER BY my_id;
--- DEMO COMPLETE
-- Watch out!
-- Issues with Distinct...
SELECT
*,
COUNT(DISTINCT dept_name) OVER ()
FROM Payroll
ORDER BY dept_name, emp_name;
-- You'd have to use GROUP BY (and list every column in the table), or some trickery...
WITH dept_stats AS (
SELECT
COUNT(DISTINCT dept_name) AS dept_count
FROM Payroll
)
SELECT *,
(SELECT * FROM dept_stats)
FROM Payroll;
--Using Window Funcitons in WHERE ... You shall not pass!
SELECT *,
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_row
FROM Payroll
WHERE ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) <= 2 --> WFs not allowed in WHERE clauses
;
-- thanks :)
@motdde

This comment has been minimized.

Copy link

@motdde motdde commented Oct 27, 2020

I started learning Postgress a few days ago, precisely the 25th of October. Your talk on windows function is enlightening for a non-accountant 🙂. Kindly share the link to the slides.

@IllusiveMilkman

This comment has been minimized.

Copy link
Owner Author

@IllusiveMilkman IllusiveMilkman commented Oct 27, 2020

@jayadevanm

This comment has been minimized.

Copy link

@jayadevanm jayadevanm commented Apr 24, 2021

Great examples. The one on how row_number and rank may not give the correct output when getting top 2 earners demonstrates clearly the differences between row_number, rank and dense_rank.

@IllusiveMilkman

This comment has been minimized.

Copy link
Owner Author

@IllusiveMilkman IllusiveMilkman commented Apr 29, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment