joevandyk/denormalize into arrays.sql Secret

## denormalize into arrays.sql
-- Coupons can be associated with one or more products.
--
-- Doing a join can be very slow on large datasets, especially
-- if the data distribution is sorta weird.
--
-- (Lots of data is fetched from both tables, and the join filters out the majority
-- of those rows)
--
-- In this example, we only care about coupons where the user_id is null.
--
-- https://gist.github.com/joevandyk/5914950 shows a trigger that can automatically denormalize the join table.

create table coupons (coupon_id serial primary key, user_id integer);
create table coupons_products(coupon_id integer references coupons, product_id integer, primary key (coupon_id, product_id));

-- insert data
insert into coupons (user_id) select * from generate_series(1, 1000000);
insert into coupons (user_id) select null from generate_series(1, 1000000);
insert into coupons_products select i, 1000 from generate_series(1, 500000)i;
insert into coupons_products select 1500000, 1000; -- the matching row in the test queries.

-- Create the indexes...
create index on coupons(coupon_id) where user_id is null;
create index on coupons_products(product_id, coupon_id);

-- Let's add an array of product_ids to the coupons
alter table coupons add column product_ids integer[];

-- and let's populate it.
with info as (select coupon_id, array_agg(product_id) product_ids from coupons_products group by coupon_id)
update coupons
  set product_ids = info.product_ids
  from info where info.coupon_id = coupons.coupon_id;

-- Create the index on coupons.product_ids[]
create index on coupons using gin(product_ids) where user_id is null;

analyze coupons;
analyze coupons_products;

-- Grabbing coupons with a product_id of 1000 with no user_id set.
-- Very slow, gets slower as data grows.
select coupons.coupon_id from coupons join coupons_products using (coupon_id) where coupons.user_id is null and product_id = 1000 group by coupon_id;

explain analyze
select coupons.coupon_id from coupons join coupons_products using (coupon_id) where coupons.user_id is null and product_id = 1000 group by coupon_id;

-- Again grabbing coupons with a product_id of 1000 with no user_id set.
-- Using the array columns this time.
-- 0.025ms! Doesn't increase in time as test result grows
select coupons.coupon_id from coupons where coupons.user_id is null and product_ids @> array[1000];

explain analyze
select coupons.coupon_id from coupons where coupons.user_id is null and product_ids @> array[1000];

## the analyze for arrays
 Bitmap Heap Scan on coupons  (cost=2473.31..18588.79 rows=247878 width=4) (actual time=0.005..0.006 rows=1 loops=1)
   Recheck Cond: ((product_ids @> '{1000}'::integer[]) AND (user_id IS NULL))
   ->  Bitmap Index Scan on coupons_product_ids_idx  (cost=0.00..2411.34 rows=247878 width=0) (actual time=0.004..0.004 rows=1 loops=1)
         Index Cond: (product_ids @> '{1000}'::integer[])
 Total runtime: 0.021 ms

## the analyze for join table
 Group  (cost=91112.70..92365.12 rows=250484 width=4) (actual time=1641.954..1641.954 rows=1 loops=1)
   ->  Sort  (cost=91112.70..91738.91 rows=250484 width=4) (actual time=1641.950..1641.950 rows=1 loops=1)
         Sort Key: coupons.coupon_id
         Sort Method: quicksort  Memory: 25kB
         ->  Hash Join  (cost=22387.19..65224.85 rows=250484 width=4) (actual time=1479.589..1641.938 rows=1 loops=1)
               Hash Cond: (coupons.coupon_id = coupons_products.coupon_id)
               ->  Bitmap Heap Scan on coupons  (cost=5720.16..28756.49 rows=1001933 width=4) (actual time=540.234..763.079 rows=1000000 loops=1)
                     Recheck Cond: (user_id IS NULL)
                     ->  Bitmap Index Scan on coupons_product_ids_idx  (cost=0.00..5469.68 rows=1001933 width=0) (actual time=539.499..539.499 rows=1000000 loops=1)
               ->  Hash  (cost=8463.01..8463.01 rows=500001 width=4) (actual time=244.732..244.732 rows=500001 loops=1)
                     Buckets: 4096  Batches: 32  Memory Usage: 561kB
                     ->  Seq Scan on coupons_products  (cost=0.00..8463.01 rows=500001 width=4) (actual time=0.016..124.053 rows=500001 loops=1)
                           Filter: (product_id = 1000)
 Total runtime: 1642.055 ms
	-- Coupons can be associated with one or more products.
	--
	-- Doing a join can be very slow on large datasets, especially
	-- if the data distribution is sorta weird.
	--
	-- (Lots of data is fetched from both tables, and the join filters out the majority
	-- of those rows)
	--
	-- In this example, we only care about coupons where the user_id is null.
	--
	-- https://gist.github.com/joevandyk/5914950 shows a trigger that can automatically denormalize the join table.

	create table coupons (coupon_id serial primary key, user_id integer);
	create table coupons_products(coupon_id integer references coupons, product_id integer, primary key (coupon_id, product_id));

	-- insert data
	insert into coupons (user_id) select * from generate_series(1, 1000000);
	insert into coupons (user_id) select null from generate_series(1, 1000000);
	insert into coupons_products select i, 1000 from generate_series(1, 500000)i;
	insert into coupons_products select 1500000, 1000; -- the matching row in the test queries.

	-- Create the indexes...
	create index on coupons(coupon_id) where user_id is null;
	create index on coupons_products(product_id, coupon_id);

	-- Let's add an array of product_ids to the coupons
	alter table coupons add column product_ids integer[];

	-- and let's populate it.
	with info as (select coupon_id, array_agg(product_id) product_ids from coupons_products group by coupon_id)
	update coupons
	set product_ids = info.product_ids
	from info where info.coupon_id = coupons.coupon_id;

	-- Create the index on coupons.product_ids[]
	create index on coupons using gin(product_ids) where user_id is null;

	analyze coupons;
	analyze coupons_products;

	-- Grabbing coupons with a product_id of 1000 with no user_id set.
	-- Very slow, gets slower as data grows.
	select coupons.coupon_id from coupons join coupons_products using (coupon_id) where coupons.user_id is null and product_id = 1000 group by coupon_id;

	explain analyze
	select coupons.coupon_id from coupons join coupons_products using (coupon_id) where coupons.user_id is null and product_id = 1000 group by coupon_id;

	-- Again grabbing coupons with a product_id of 1000 with no user_id set.
	-- Using the array columns this time.
	-- 0.025ms! Doesn't increase in time as test result grows
	select coupons.coupon_id from coupons where coupons.user_id is null and product_ids @> array[1000];

	explain analyze
	select coupons.coupon_id from coupons where coupons.user_id is null and product_ids @> array[1000];
	Bitmap Heap Scan on coupons (cost=2473.31..18588.79 rows=247878 width=4) (actual time=0.005..0.006 rows=1 loops=1)
	Recheck Cond: ((product_ids @> '{1000}'::integer[]) AND (user_id IS NULL))
	-> Bitmap Index Scan on coupons_product_ids_idx (cost=0.00..2411.34 rows=247878 width=0) (actual time=0.004..0.004 rows=1 loops=1)
	Index Cond: (product_ids @> '{1000}'::integer[])
	Total runtime: 0.021 ms
	Group (cost=91112.70..92365.12 rows=250484 width=4) (actual time=1641.954..1641.954 rows=1 loops=1)
	-> Sort (cost=91112.70..91738.91 rows=250484 width=4) (actual time=1641.950..1641.950 rows=1 loops=1)
	Sort Key: coupons.coupon_id
	Sort Method: quicksort Memory: 25kB
	-> Hash Join (cost=22387.19..65224.85 rows=250484 width=4) (actual time=1479.589..1641.938 rows=1 loops=1)
	Hash Cond: (coupons.coupon_id = coupons_products.coupon_id)
	-> Bitmap Heap Scan on coupons (cost=5720.16..28756.49 rows=1001933 width=4) (actual time=540.234..763.079 rows=1000000 loops=1)
	Recheck Cond: (user_id IS NULL)
	-> Bitmap Index Scan on coupons_product_ids_idx (cost=0.00..5469.68 rows=1001933 width=0) (actual time=539.499..539.499 rows=1000000 loops=1)
	-> Hash (cost=8463.01..8463.01 rows=500001 width=4) (actual time=244.732..244.732 rows=500001 loops=1)
	Buckets: 4096 Batches: 32 Memory Usage: 561kB
	-> Seq Scan on coupons_products (cost=0.00..8463.01 rows=500001 width=4) (actual time=0.016..124.053 rows=500001 loops=1)
	Filter: (product_id = 1000)
	Total runtime: 1642.055 ms