mishadoff/pg_array_index.sql

## pg_array_index.sql
-- Tinkering with postgres arrays

-- Part I: PREPARE

-- create table
create table movies (
  title text,
  tags text[],
  year int
);

-- check the table is there
select * from movies;

-- generate 100M records movies with random data (runs ~10 mins)
-- we need a lot of data to make sure postgres is not using RAM
with tags as (
 select array['comedy', 'drama', 'crime', 'history', 'thriller',
              'biography', 'sport', 'horror', 'detective', 'fantasy'] arr
)

insert into movies (title, tags, year)
select
  'Random Title #' || id,
  -- take 3 random tags for each movie
  (select array[
     arr[floor(random() * 10 + 1)],
     arr[floor(random() * 10 + 1)],
     arr[floor(random() * 10 + 1)]
   ] from tags where id = id),
   -- trick where id = id forces new filter for each subselect
   -- therefore it uses different array each time
  (1980 + random() * 30)::int
from generate_series(1, 100000000) as t(id);

-- estimate table size
-- table occupies ~11GB
select pg_size_pretty(pg_total_relation_size('movies'));

-- add one more movie
insert into movies (title, tags, year)
values ('Ghost in the shell', '{"cyberpunk"}', 2018);


-- Part II: NO INDEX

--- How many movies? : 100M / 35s
select count(*), avg(year) from movies;

--- Show first 5 movies : 5 / 3s
select * from movies limit 5;

--- Find 'cyberpunk' movies
--- time 47s
select * from movies where 'cyberpunk' = ANY(tags);

-- slow because sequential scan is used
explain
select * from movies where 'cyberpunk' = ANY(tags);

-- Part III: DEFAULT INDEX

-- Create default index on array (~15mins)
create index movies_tags_idx1 on movies(tags);

--- Find 'cyberpunk' movies (ANY)
--- time 69s
select * from movies where 'cyberpunk' = ANY(tags);

--- Find 'cyberpunk' movies (CONTAINS)
--- time 81s
select * from movies where tags @> array['cyberpunk'];

--- Find 'cyberpunk'-only movies (EXACT MATCH)
--- time 1ms
select * from movies where tags = array['cyberpunk'];

--- Add new cyberpunk movie
insert into movies (title, tags, year)
values ('Bladerunner', '{"cyberpunk", "action"}', 2018);

-- Exact match won't return it :(
select * from movies where tags = array['cyberpunk'];

-- default index is not suitable for arrays
drop index movies_tags_idx1;

-- Part IV: GIN INDEX

-- GIN is for Generalized INverted INdex (~11min)
create index movies_tags_idx2 on movies using GIN(tags);

--- Exact match is fast
--- time 34ms
select * from movies where tags = array['cyberpunk'];

--- Contains is fast
--- time 15ms
select * from movies where tags @> array['cyberpunk'];

--- ANY is still slow
--- time 47s
select * from movies where 'cyberpunk' = ANY(tags);

-- Conclusion

-- Use GIN index for arrays
-- Use @> for contains condition
	-- Tinkering with postgres arrays

	-- Part I: PREPARE

	-- create table
	create table movies (
	title text,
	tags text[],
	year int
	);

	-- check the table is there
	select * from movies;

	-- generate 100M records movies with random data (runs ~10 mins)
	-- we need a lot of data to make sure postgres is not using RAM
	with tags as (
	select array['comedy', 'drama', 'crime', 'history', 'thriller',
	'biography', 'sport', 'horror', 'detective', 'fantasy'] arr
	)

	insert into movies (title, tags, year)
	select
	'Random Title #' \|\| id,
	-- take 3 random tags for each movie
	(select array[
	arr[floor(random() * 10 + 1)],
	arr[floor(random() * 10 + 1)],
	arr[floor(random() * 10 + 1)]
	] from tags where id = id),
	-- trick where id = id forces new filter for each subselect
	-- therefore it uses different array each time
	(1980 + random() * 30)::int
	from generate_series(1, 100000000) as t(id);

	-- estimate table size
	-- table occupies ~11GB
	select pg_size_pretty(pg_total_relation_size('movies'));

	-- add one more movie
	insert into movies (title, tags, year)
	values ('Ghost in the shell', '{"cyberpunk"}', 2018);


	-- Part II: NO INDEX

	--- How many movies? : 100M / 35s
	select count(*), avg(year) from movies;

	--- Show first 5 movies : 5 / 3s
	select * from movies limit 5;

	--- Find 'cyberpunk' movies
	--- time 47s
	select * from movies where 'cyberpunk' = ANY(tags);

	-- slow because sequential scan is used
	explain
	select * from movies where 'cyberpunk' = ANY(tags);

	-- Part III: DEFAULT INDEX

	-- Create default index on array (~15mins)
	create index movies_tags_idx1 on movies(tags);

	--- Find 'cyberpunk' movies (ANY)
	--- time 69s
	select * from movies where 'cyberpunk' = ANY(tags);

	--- Find 'cyberpunk' movies (CONTAINS)
	--- time 81s
	select * from movies where tags @> array['cyberpunk'];

	--- Find 'cyberpunk'-only movies (EXACT MATCH)
	--- time 1ms
	select * from movies where tags = array['cyberpunk'];

	--- Add new cyberpunk movie
	insert into movies (title, tags, year)
	values ('Bladerunner', '{"cyberpunk", "action"}', 2018);

	-- Exact match won't return it :(
	select * from movies where tags = array['cyberpunk'];

	-- default index is not suitable for arrays
	drop index movies_tags_idx1;

	-- Part IV: GIN INDEX

	-- GIN is for Generalized INverted INdex (~11min)
	create index movies_tags_idx2 on movies using GIN(tags);

	--- Exact match is fast
	--- time 34ms
	select * from movies where tags = array['cyberpunk'];

	--- Contains is fast
	--- time 15ms
	select * from movies where tags @> array['cyberpunk'];

	--- ANY is still slow
	--- time 47s
	select * from movies where 'cyberpunk' = ANY(tags);

	-- Conclusion

	-- Use GIN index for arrays
	-- Use @> for contains condition