Skip to content

Instantly share code, notes, and snippets.

@mishadoff
Created May 28, 2019 20:23
Show Gist options
  • Save mishadoff/0f60bdc1672e52b381f7c58dedc1d7fe to your computer and use it in GitHub Desktop.
Save mishadoff/0f60bdc1672e52b381f7c58dedc1d7fe to your computer and use it in GitHub Desktop.
Array GIN index
-- Tinkering with postgres arrays
-- Part I: PREPARE
-- create table
create table movies (
title text,
tags text[],
year int
);
-- check the table is there
select * from movies;
-- generate 100M records movies with random data (runs ~10 mins)
-- we need a lot of data to make sure postgres is not using RAM
with tags as (
select array['comedy', 'drama', 'crime', 'history', 'thriller',
'biography', 'sport', 'horror', 'detective', 'fantasy'] arr
)
insert into movies (title, tags, year)
select
'Random Title #' || id,
-- take 3 random tags for each movie
(select array[
arr[floor(random() * 10 + 1)],
arr[floor(random() * 10 + 1)],
arr[floor(random() * 10 + 1)]
] from tags where id = id),
-- trick where id = id forces new filter for each subselect
-- therefore it uses different array each time
(1980 + random() * 30)::int
from generate_series(1, 100000000) as t(id);
-- estimate table size
-- table occupies ~11GB
select pg_size_pretty(pg_total_relation_size('movies'));
-- add one more movie
insert into movies (title, tags, year)
values ('Ghost in the shell', '{"cyberpunk"}', 2018);
-- Part II: NO INDEX
--- How many movies? : 100M / 35s
select count(*), avg(year) from movies;
--- Show first 5 movies : 5 / 3s
select * from movies limit 5;
--- Find 'cyberpunk' movies
--- time 47s
select * from movies where 'cyberpunk' = ANY(tags);
-- slow because sequential scan is used
explain
select * from movies where 'cyberpunk' = ANY(tags);
-- Part III: DEFAULT INDEX
-- Create default index on array (~15mins)
create index movies_tags_idx1 on movies(tags);
--- Find 'cyberpunk' movies (ANY)
--- time 69s
select * from movies where 'cyberpunk' = ANY(tags);
--- Find 'cyberpunk' movies (CONTAINS)
--- time 81s
select * from movies where tags @> array['cyberpunk'];
--- Find 'cyberpunk'-only movies (EXACT MATCH)
--- time 1ms
select * from movies where tags = array['cyberpunk'];
--- Add new cyberpunk movie
insert into movies (title, tags, year)
values ('Bladerunner', '{"cyberpunk", "action"}', 2018);
-- Exact match won't return it :(
select * from movies where tags = array['cyberpunk'];
-- default index is not suitable for arrays
drop index movies_tags_idx1;
-- Part IV: GIN INDEX
-- GIN is for Generalized INverted INdex (~11min)
create index movies_tags_idx2 on movies using GIN(tags);
--- Exact match is fast
--- time 34ms
select * from movies where tags = array['cyberpunk'];
--- Contains is fast
--- time 15ms
select * from movies where tags @> array['cyberpunk'];
--- ANY is still slow
--- time 47s
select * from movies where 'cyberpunk' = ANY(tags);
-- Conclusion
-- Use GIN index for arrays
-- Use @> for contains condition
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment