ryanguill/postgres-pivot.md

## postgres-pivot.md

      
    Raw
  

              postgres-pivot.md
            
          
/*
================================================================================
Pivot example with variable number of columns in the output.
================================================================================

example data is straight forward, imagine a table with a customer identifier, 
an invoice date and an amount.
*/

DROP TABLE IF EXISTS invoice;
CREATE TEMPORARY TABLE invoice (
  id serial not null primary key,
  customer_id int not null,
  invoice_date date not null,
  amount int not null
);


INSERT INTO invoice (customer_id, invoice_date, amount)
values
  (2, '2022-06-16 04:11:50', 8706)
, (2, '2022-02-12 15:26:53', 4163)
, (2, '2022-12-05 23:30:36', 3473)
, (2, '2022-11-01 21:16:45', 5952)
, (2, '2022-10-03 09:35:34', 4170)
, (2, '2022-06-09 03:22:14', 4818)
, (1, '2022-12-02 09:15:26', 463)
, (1, '2022-02-19 09:58:43', 2944)
, (1, '2022-12-03 12:38:20', 5879)
, (2, '2022-01-13 10:54:31', 4294)
, (2, '2022-01-28 14:03:22', 2557)
, (1, '2022-07-10 19:06:52', 6506)
, (2, '2022-02-18 11:50:49', 1105)
, (2, '2022-08-29 06:54:49', 6794)
, (2, '2022-04-15 07:00:34', 2136)
, (1, '2022-07-13 23:30:07', 1421)
, (1, '2022-12-29 12:24:54', 5460)
, (1, '2022-06-20 23:12:59', 4217)
, (1, '2022-04-15 23:47:50', 2695)
, (1, '2022-11-22 05:51:11', 3208)
, (2, '2022-04-11 12:29:28', 1921)
, (2, '2022-01-09 11:51:56', 6512)
, (2, '2022-07-10 23:14:41', 1658)
, (1, '2022-06-02 16:44:31', 3348)
, (1, '2022-11-08 11:15:59', 467)
, (2, '2022-09-01 20:13:31', 9222)
, (2, '2022-09-02 22:11:56', 7029)
, (2, '2022-06-07 11:22:30', 9933)
, (1, '2022-04-11 20:58:57', 2284)
, (2, '2022-12-13 06:30:01', 7945)
, (2, '2022-10-19 17:16:21', 8940)
, (1, '2022-02-16 06:57:09', 1202)
, (2, '2022-01-17 22:02:24', 7567)
, (2, '2022-11-16 17:02:20', 545)
, (2, '2022-12-13 12:40:18', 6266)
, (2, '2022-07-31 00:12:44', 9151)
, (2, '2022-05-20 03:14:10', 7798)
, (1, '2022-04-30 04:32:55', 420)
, (2, '2022-12-10 02:21:36', 9770)
, (1, '2022-09-28 15:55:24', 14)
, (2, '2022-09-29 05:34:56', 2170)
, (2, '2022-09-12 22:58:57', 2598)
, (2, '2022-01-16 09:54:42', 1100)
, (1, '2022-06-26 09:55:05', 7256)
, (1, '2022-02-05 10:42:52', 6024)
, (1, '2022-10-19 16:30:23', 5733)
, (1, '2022-12-28 02:08:37', 2645)
, (2, '2022-02-19 14:22:03', 7315)
, (1, '2022-10-28 05:39:54', 9733)
;

✓

✓


49 rows affected


/*
I'm going to explain the moving parts one by one and we will bring it all 
together at the end.

First thing we need to do is gather all of the data we want to use as 
columns in our pivoted output, plus any data we want to break by on 
those columns.
Lets just do the first part to start with and then we will come back and show 
how to break by other data.

So lets start by just trying to sum all of the amounts by month and have a 
column for each month.  This data is for the year 2022, and I want to make sure
we dont miss any empty months in our output where we might have had no revenue, 
so we will use generate_series to get a data point for each month.
*/

SELECT
  generate_series::DATE dte
FROM GENERATE_SERIES('2022-01-01'::DATE, '2022-12-31', '1 month')

| dte        |
| :--------- |
| 2022-01-01 |
| 2022-02-01 |
| 2022-03-01 |
| 2022-04-01 |
| 2022-05-01 |
| 2022-06-01 |
| 2022-07-01 |
| 2022-08-01 |
| 2022-09-01 |
| 2022-10-01 |
| 2022-11-01 |
| 2022-12-01 |


/*
Now, lets take our columns and get the data for each - this will give us a 
normal relational dataset where our pivoted "columns" will be their own 
column and the data for each will be another column.

We select from our columns data set and LEFT OUTER JOIN so that if there was 
no data for that column we would still get it in our output. 
Notice that March does not have any amount, but we still have the row.
*/


WITH columns AS (
  SELECT
    generate_series::DATE dte
  FROM GENERATE_SERIES('2022-01-01'::DATE, '2022-12-31', '1 month')
)
SELECT
    columns.dte
  , SUM(COALESCE(invoice.amount, 0)) total
FROM columns
LEFT OUTER
  JOIN invoice
  ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
  AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
GROUP BY
    columns.dte
ORDER BY
    columns.dte


dte        | total
:--------- | ----:
2022-01-01 | 22030
2022-02-01 | 22753
2022-03-01 |     0
2022-04-01 |  9456
2022-05-01 |  7798
2022-06-01 | 38278
2022-07-01 | 18736
2022-08-01 |  6794
2022-09-01 | 21033
2022-10-01 | 28576
2022-11-01 | 10172
2022-12-01 | 41901


/*
Now comes the secret sauce, the JSONB_OBJECT_AGG() that makes our pivot work.  
We aggregate our data together, passing the "column" we want to use as the 
first argument and the data we want to aggregate using that column as 
the second argument.

Basically the first argument is your pivoted "column" column from the data set 
(or a formatted version of it, like in our example) and the second arg is 
the "cell" of data you want for that "column".

You get a json object with a key for each "column" and its value being its data.
(Remember to scroll down for a more complicated example where we are
aggregating this data by customer)
*/
WITH columns AS (
  SELECT
    generate_series::DATE dte
  FROM GENERATE_SERIES('2022-01-01'::DATE, '2022-12-31', '1 month')
)
, data AS (
  SELECT
      columns.dte
    , SUM(COALESCE(invoice.amount, 0)) total
  FROM columns
  LEFT OUTER
    JOIN invoice
    ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
    AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
  GROUP BY
      columns.dte
)
SELECT
  JSONB_OBJECT_AGG(TO_CHAR(dte, 'YYYY-MM'), total) pivotData
FROM
  data;

| pivotdata                                                                                                                                                                                                         |
| :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| {"2022-01": 22030, "2022-02": 22753, "2022-03": 0, "2022-04": 9456, "2022-05": 7798, "2022-06": 38278, "2022-07": 18736, "2022-08": 6794, "2022-09": 21033, "2022-10": 28576, "2022-11": 10172, "2022-12": 41901} |


/*
If your application can use the json object as it is you can stop 
at the last query, but if you need to get actual columns out and 
you dont mind creating a dynamic sql query, you can pull the data 
out into columns like this example.  
*But* this requires you to know the columns you want before hand.
*/


WITH columns AS (
  SELECT
    generate_series::DATE dte
  FROM GENERATE_SERIES('2022-01-01'::DATE, '2022-12-31', '1 month')
)
, data AS (
  SELECT
      columns.dte
    , SUM(COALESCE(invoice.amount, 0)) total
  FROM columns
  LEFT OUTER
    JOIN invoice
    ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
    AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
  GROUP BY
      columns.dte
), result AS (
SELECT
  JSONB_OBJECT_AGG(TO_CHAR(dte, 'YYYY-MM'), total) pivotData
FROM
  data
)
SELECT
    (pivotData->>'2022-01') "jan"
  , (pivotData->>'2022-02') "feb"
  , (pivotData->>'2022-03') "mar"
  , (pivotData->>'2022-04') "apr"
  , (pivotData->>'2022-05') "may"
  , (pivotData->>'2022-06') "jun"
  , (pivotData->>'2022-07') "jul"
  , (pivotData->>'2022-08') "aug"
  , (pivotData->>'2022-09') "sep"
  , (pivotData->>'2022-10') "oct"
  , (pivotData->>'2022-11') "nov"
  , (pivotData->>'2022-12') "dec"

FROM
  result
  ;

jan   | feb   | mar | apr  | may  | jun   | jul   | aug  | sep   | oct   | nov   | dec  
:---- | :---- | :-- | :--- | :--- | :---- | :---- | :--- | :---- | :---- | :---- | :----
22030 | 22753 | 0   | 9456 | 7798 | 38278 | 18736 | 6794 | 21033 | 28576 | 10172 | 41901


/*

Now for a more complicated example, lets say you want to do the same thing,
but you want to get totals for each customer.  You just need to pull 
that grouped data along for the ride.

Your "columns" query needs to create a column for each month, plus the data 
you're grouping by. We use a cross join here with a list of all of our 
customers, again so that if we are missing data for one month for a customer 
we will still have that column in our output.
*/

SELECT
    generate_series dte
  , customer_id
FROM generate_series('2022-01-01'::date, '2022-12-31', '1 month')
CROSS JOIN
  (
    SELECT DISTINCT
      customer_id
    FROM
      invoice
  ) AS customers

dte                    | customer_id
:--------------------- | ----------:
2022-01-01 00:00:00+00 |           2
2022-01-01 00:00:00+00 |           1
2022-02-01 00:00:00+00 |           2
2022-02-01 00:00:00+00 |           1
2022-03-01 00:00:00+00 |           2
2022-03-01 00:00:00+00 |           1
2022-04-01 00:00:00+01 |           2
2022-04-01 00:00:00+01 |           1
2022-05-01 00:00:00+01 |           2
2022-05-01 00:00:00+01 |           1
2022-06-01 00:00:00+01 |           2
2022-06-01 00:00:00+01 |           1
2022-07-01 00:00:00+01 |           2
2022-07-01 00:00:00+01 |           1
2022-08-01 00:00:00+01 |           2
2022-08-01 00:00:00+01 |           1
2022-09-01 00:00:00+01 |           2
2022-09-01 00:00:00+01 |           1
2022-10-01 00:00:00+01 |           2
2022-10-01 00:00:00+01 |           1
2022-11-01 00:00:00+00 |           2
2022-11-01 00:00:00+00 |           1
2022-12-01 00:00:00+00 |           2
2022-12-01 00:00:00+00 |           1


/*
Now we need to group our "data" query by the grouping column and 
sum for each "column".

You can see some examples this time of months with no money.
*/

WITH columns AS (
  SELECT
       generate_series dte
     , customer_id
  FROM generate_series('2022-01-01'::date, '2022-12-31', '1 month')
  CROSS JOIN
    (
      SELECT DISTINCT
        customer_id
      FROM
        invoice
    ) AS customers
)
SELECT
    columns.dte
  , columns.customer_id
  , SUM(COALESCE(invoice.amount, 0)) total
FROM columns
LEFT OUTER
  JOIN invoice
  ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
  AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
  AND columns.customer_id = invoice.customer_id
GROUP BY
    columns.dte
  , columns.customer_id;

dte                    | customer_id | total
:--------------------- | ----------: | ----:
2022-12-01 00:00:00+00 |           1 | 14447
2022-04-01 00:00:00+01 |           2 |  4057
2022-09-01 00:00:00+01 |           1 |    14
2022-03-01 00:00:00+00 |           1 |     0
2022-03-01 00:00:00+00 |           2 |     0
2022-05-01 00:00:00+01 |           1 |     0
2022-06-01 00:00:00+01 |           2 | 23457
2022-01-01 00:00:00+00 |           2 | 22030
2022-12-01 00:00:00+00 |           2 | 27454
2022-10-01 00:00:00+01 |           2 | 13110
2022-08-01 00:00:00+01 |           2 |  6794
2022-08-01 00:00:00+01 |           1 |     0
2022-02-01 00:00:00+00 |           1 | 10170
2022-02-01 00:00:00+00 |           2 | 12583
2022-10-01 00:00:00+01 |           1 | 15466
2022-11-01 00:00:00+00 |           1 |  3675
2022-05-01 00:00:00+01 |           2 |  7798
2022-01-01 00:00:00+00 |           1 |     0
2022-06-01 00:00:00+01 |           1 | 14821
2022-04-01 00:00:00+01 |           1 |  5399
2022-07-01 00:00:00+01 |           2 | 10809
2022-09-01 00:00:00+01 |           2 | 21019
2022-07-01 00:00:00+01 |           1 |  7927
2022-11-01 00:00:00+00 |           2 |  6497


/*
Now we do our final aggregation, passing our "column" heading as the first arg, 
the "cell" as the second arg, and the grouping by our customer_id.

Again if your application can handle the json data then you may be done, but if 
you want to turn it back into columns see the next example.
*/

WITH columns AS (
  SELECT
       generate_series dte
     , customer_id
  FROM generate_series('2022-01-01'::date, '2022-12-31', '1 month')
  CROSS JOIN
    (
      SELECT DISTINCT
        customer_id
      FROM
        invoice
    ) AS customers
), data AS (
  SELECT
      columns.dte
    , columns.customer_id
    , SUM(COALESCE(invoice.amount, 0)) total
  FROM columns
  LEFT OUTER
    JOIN invoice
    ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
    AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
    AND columns.customer_id = invoice.customer_id
  GROUP BY
      columns.dte
    , columns.customer_id
)
SELECT
    customer_id
  , JSONB_OBJECT_AGG(TO_CHAR(dte, 'YYYY-MM'), total) pivotData
FROM
  data
GROUP BY
  customer_id
;

customer_id | pivotdata                                                                                                                                                                                                       
----------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
          2 | {"2022-01": 22030, "2022-02": 12583, "2022-03": 0, "2022-04": 4057, "2022-05": 7798, "2022-06": 23457, "2022-07": 10809, "2022-08": 6794, "2022-09": 21019, "2022-10": 13110, "2022-11": 6497, "2022-12": 27454}
          1 | {"2022-01": 0, "2022-02": 10170, "2022-03": 0, "2022-04": 5399, "2022-05": 0, "2022-06": 14821, "2022-07": 7927, "2022-08": 0, "2022-09": 14, "2022-10": 15466, "2022-11": 3675, "2022-12": 14447}              


/*
And here is our final output back as columns.
*/
WITH columns AS (
  SELECT
       generate_series dte
     , customer_id
  FROM generate_series('2022-01-01'::date, '2022-12-31', '1 month')
  CROSS JOIN
    (
      SELECT DISTINCT
        customer_id
      FROM
        invoice
    ) AS customers
), data AS (
  SELECT
      columns.dte
    , columns.customer_id
    , SUM(COALESCE(invoice.amount, 0)) total
  FROM columns
  LEFT OUTER
    JOIN invoice
    ON DATE_PART('year', invoice_date) = DATE_PART('year', columns.dte)
    AND DATE_PART('month', invoice_date) = DATE_PART('month', columns.dte)
    AND columns.customer_id = invoice.customer_id
  GROUP BY
      columns.dte
    , columns.customer_id
), result AS (
  SELECT
    customer_id
    , JSONB_OBJECT_AGG(TO_CHAR(dte, 'YYYY-MM'), total) pivotData
  FROM
    data
  GROUP BY
    customer_id
)
SELECT
    customer_id
  , (pivotData->>'2022-01') "jan"
  , (pivotData->>'2022-02') "feb"
  , (pivotData->>'2022-03') "mar"
  , (pivotData->>'2022-04') "apr"
  , (pivotData->>'2022-05') "may"
  , (pivotData->>'2022-06') "jun"
  , (pivotData->>'2022-07') "jul"
  , (pivotData->>'2022-08') "aug"
  , (pivotData->>'2022-09') "sep"
  , (pivotData->>'2022-10') "oct"
  , (pivotData->>'2022-11') "nov"
  , (pivotData->>'2022-12') "dec"
FROM
  result
ORDER BY customer_id

customer_id | jan   | feb   | mar | apr  | may  | jun   | jul   | aug  | sep   | oct   | nov  | dec  
----------: | :---- | :---- | :-- | :--- | :--- | :---- | :---- | :--- | :---- | :---- | :--- | :----
          1 | 0     | 10170 | 0   | 5399 | 0    | 14821 | 7927  | 0    | 14    | 15466 | 3675 | 14447
          2 | 22030 | 12583 | 0   | 4057 | 7798 | 23457 | 10809 | 6794 | 21019 | 13110 | 6497 | 27454


db<>fiddle here