/* This demonstrates a trigger for denormalizing a join table into an array.
   Hasn't really been tested much, but seems to work.
   Will put it into production tomorrow and see!

   Maybe PostgreSQL 9.4 will come with arrays of foreign keys; that would remove
   a lot of the need for simple join tables. The array-element foreign key work
   (http://blog.2ndquadrant.com/postgresql-9-3-development-array-element-foreign-keys)
   didn't make it into 9.3.
*/
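/* For context, the proposed (and never merged) array-element foreign key syntax
   sketched in that post looked roughly like the following. Hypothetical -- it
   does not run on any released PostgreSQL, so it stays in this comment:

   create table line_items (
     id           integer primary key,
     shipment_ids integer[],
     foreign key (each element of shipment_ids) references shipments (id)
   );
*/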
set client_min_messages = 'warning';
drop schema if exists test cascade;
create schema test;
set search_path = 'test';
-- Not used by the trigger, but it's pretty handy, and is used by the test code.
-- Returns true if the array is empty, false if it isn't. Note it returns null
-- (not true) for a null array, which is why shipment_ids defaults to '{}' below.
create or replace function array_empty(anyarray) returns boolean as $$
select $1 = '{}'
$$ language sql immutable;
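-- A few sanity checks; expected results in the trailing comments.
select array_empty('{}'::integer[]);   -- true
select array_empty(array[1, 2]);       -- false
select array_empty(null::integer[]);   -- null, not true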
-- array_remove was added in postgresql 9.3; calling this one array_erase so it
-- won't conflict there. Removes every occurrence of $2 (and, as written, any
-- nulls, since "null <> $2" is null).
create or replace function array_erase(anyarray, anyelement) returns anyarray as $$
select array(select v from unnest($1) g(v) where v <> $2)
$$ language sql immutable;
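-- For example (expected result in the comment):
select array_erase(array[1, 2, 3, 2], 2);  -- {1,3}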
-- Trigger function to handle the denormalization.
create function denormalize_join_into_array() returns trigger as $f$
declare
  -- TODO figure out better names for these variables
  v_foreign_table_name        text := TG_ARGV[0]; -- table that's being updated
  v_foreign_pkey              text := TG_ARGV[1]; -- column on the updated table that contains the primary key
  v_foreign_array_column_name text := TG_ARGV[2]; -- column on the updated table that stores the array
  v_source_fkey               text := TG_ARGV[3]; -- column on the source table that contains the foreign key to the table being updated
  v_source_column_name        text := TG_ARGV[4]; -- column on the source table that contains the value that goes into the array
  should_update boolean := false;
begin
  if TG_OP = 'UPDATE' then
    -- Only bother updating if there's been a change we care about.
    -- On updates, we erase the value from the old row's array and append it to the new one's.
    -- TODO figure out a better way to do this.
    execute format('select $1.%1$I is distinct from $2.%1$I or $1.%2$I is distinct from $2.%2$I',
                   v_source_column_name, v_source_fkey)
      into should_update using new, old;
  end if;
  -- TODO figure out a simple way to remove the duplication in the update statements.
  if TG_OP = 'INSERT' or should_update then
    -- Append the new value to the array column.
    execute format('update %1$I set %2$I = array_append(%2$I, $1.%3$I) where $1.%5$I = %1$I.%4$I',
                   v_foreign_table_name, v_foreign_array_column_name,
                   v_source_column_name, v_foreign_pkey, v_source_fkey) using new;
  end if;
  if TG_OP = 'DELETE' or should_update then
    -- Remove the old value from the array column.
    execute format('update %1$I set %2$I = array_erase(%2$I, $1.%3$I) where $1.%5$I = %1$I.%4$I',
                   v_foreign_table_name, v_foreign_array_column_name,
                   v_source_column_name, v_foreign_pkey, v_source_fkey) using old;
  end if;
  return null; -- after trigger, so the return value is ignored
end $f$ language plpgsql;
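/* With the line_items/shipments wiring below, the append branch expands to
   roughly this statement, with $1 bound to the NEW join-table row:

   update line_items
      set shipment_ids = array_append(shipment_ids, $1.shipment_id)
    where $1.line_item_id = line_items.id;

   The erase branch is the same shape, with array_erase and the OLD row. */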
-- Define line_items <-> shipments relations.
create table line_items (
  id           integer primary key,
  -- Default to an empty (not null) array so array_empty() can find unshipped
  -- rows; a null array would compare as null, not true, in the queries below.
  shipment_ids integer[] not null default '{}'
);
create table shipments (
  id              integer primary key,
  tracking_number text not null
);
create table line_items_shipments (
  line_item_id integer references line_items,
  shipment_id  integer references shipments,
  primary key (line_item_id, shipment_id)
);
-- Define the denormalization trigger. The arguments are: the table to update,
-- its primary key column, its array column, then the join table's foreign key
-- column and the column whose value goes into the array.
create trigger denormalize after insert or update or delete on line_items_shipments
for each row execute procedure denormalize_join_into_array(
  'line_items', 'id', 'shipment_ids', 'line_item_id', 'shipment_id');
insert into line_items (id) select * from generate_series(1, 250000);
insert into shipments (id, tracking_number) select *, 'tracking number' from generate_series(1, 249990);
insert into line_items_shipments select id, id from shipments;
-- That leaves 10 line items that haven't been shipped.
-- We're gonna need a GIN index.
create index on line_items using gin(shipment_ids);
analyze line_items; analyze shipments; analyze line_items_shipments;
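-- The GIN index also speeds up containment queries, e.g. finding the line
-- items a given shipment appears on (shipment 42 is an arbitrary example):
select count(*) from line_items where shipment_ids @> array[42];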
-- Test Queries below.
-- Find unshipped line items via array. Much faster. Simpler, sexier query.
explain analyze select count(*) from line_items li where array_empty(shipment_ids);
/* Aggregate  (cost=20.02..20.03 rows=1 width=0) (actual time=0.054..0.054 rows=1 loops=1)
     ->  Bitmap Heap Scan on line_items li  (cost=16.01..20.02 rows=1 width=0) (actual time=0.042..0.043 rows=10 loops=1)
           Recheck Cond: (shipment_ids = '{}'::integer[])
           ->  Bitmap Index Scan on line_items_shipment_ids_idx  (cost=0.00..16.01 rows=1 width=0) (actual time=0.021..0.021 rows=10 loops=1)
                 Index Cond: (shipment_ids = '{}'::integer[])
   Total runtime: 0.099 ms */
-- Find unshipped line items via join. This is slow.
explain analyze
select count(*) from line_items li
left join line_items_shipments lis on lis.line_item_id = li.id where lis.line_item_id is null;
/* Aggregate  (cost=18445.86..18445.87 rows=1 width=0) (actual time=382.830..382.830 rows=1 loops=1)
     ->  Hash Anti Join  (cost=7708.77..18445.83 rows=10 width=0) (actual time=264.776..382.814 rows=10 loops=1)
           Hash Cond: (li.id = lis.line_item_id)
           ->  Seq Scan on line_items li  (cost=0.00..5931.00 rows=250000 width=4) (actual time=32.664..86.894 rows=250000 loops=1)
           ->  Hash  (cost=3606.90..3606.90 rows=249990 width=4) (actual time=111.456..111.456 rows=249990 loops=1)
                 Buckets: 4096  Batches: 16  Memory Usage: 560kB
                 ->  Seq Scan on line_items_shipments lis  (cost=0.00..3606.90 rows=249990 width=4) (actual time=0.009..49.887 rows=249990 loops=1)
   Total runtime: 382.882 ms */
-- Tests to make sure that inserts, updates, and deletes to the join table work
-- as expected; expected shipment_ids for ids 1-3 are in the trailing comments.
select * from line_items where id <= 3;  -- {1}, {2}, {3}
update line_items_shipments set line_item_id = 1 where line_item_id = 2;
update line_items_shipments set shipment_id = 1 where line_item_id = 3;
select * from line_items where id <= 3;  -- {1,2}, {}, {1}
delete from line_items_shipments where shipment_id = 1;
select * from line_items where id <= 3;  -- {2}, {}, {}