Created
May 28, 2019 20:23
-
-
Save mishadoff/0f60bdc1672e52b381f7c58dedc1d7fe to your computer and use it in GitHub Desktop.
Array GIN index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Tinkering with postgres arrays | |
-- Part I: PREPARE | |
-- create table | |
create table movies ( | |
title text, | |
tags text[], | |
year int | |
); | |
-- check the table is there | |
select * from movies; | |
-- generate 100M records movies with random data (runs ~10 mins) | |
-- we need a lot of data to make sure postgres is not using RAM | |
with tags as ( | |
select array['comedy', 'drama', 'crime', 'history', 'thriller', | |
'biography', 'sport', 'horror', 'detective', 'fantasy'] arr | |
) | |
insert into movies (title, tags, year) | |
select | |
'Random Title #' || id, | |
-- take 3 random tags for each movie | |
(select array[ | |
arr[floor(random() * 10 + 1)], | |
arr[floor(random() * 10 + 1)], | |
arr[floor(random() * 10 + 1)] | |
] from tags where id = id), | |
-- trick where id = id forces new filter for each subselect | |
-- therefore it uses different array each time | |
(1980 + random() * 30)::int | |
from generate_series(1, 100000000) as t(id); | |
-- estimate table size | |
-- table occupies ~11GB | |
select pg_size_pretty(pg_total_relation_size('movies')); | |
-- add one more movie | |
insert into movies (title, tags, year) | |
values ('Ghost in the shell', '{"cyberpunk"}', 2018); | |
-- Part II: NO INDEX | |
--- How many movies? : 100M / 35s | |
select count(*), avg(year) from movies; | |
--- Show first 5 movies : 5 / 3s | |
select * from movies limit 5; | |
--- Find 'cyberpunk' movies | |
--- time 47s | |
select * from movies where 'cyberpunk' = ANY(tags); | |
-- slow because sequential scan is used | |
explain | |
select * from movies where 'cyberpunk' = ANY(tags); | |
-- Part III: DEFAULT INDEX | |
-- Create default index on array (~15mins) | |
create index movies_tags_idx1 on movies(tags); | |
--- Find 'cyberpunk' movies (ANY) | |
--- time 69s | |
select * from movies where 'cyberpunk' = ANY(tags); | |
--- Find 'cyberpunk' movies (CONTAINS) | |
--- time 81s | |
select * from movies where tags @> array['cyberpunk']; | |
--- Find 'cyberpunk'-only movies (EXACT MATCH) | |
--- time 1ms | |
select * from movies where tags = array['cyberpunk']; | |
--- Add new cyberpunk movie | |
insert into movies (title, tags, year) | |
values ('Bladerunner', '{"cyberpunk", "action"}', 2018); | |
-- Exact match won't return it :( | |
select * from movies where tags = array['cyberpunk']; | |
-- default index is not suitable for arrays | |
drop index movies_tags_idx1; | |
-- Part IV: GIN INDEX | |
-- GIN is for Generalized INverted INdex (~11min) | |
create index movies_tags_idx2 on movies using GIN(tags); | |
--- Exact match is fast | |
--- time 34ms | |
select * from movies where tags = array['cyberpunk']; | |
--- Contains is fast | |
--- time 15ms | |
select * from movies where tags @> array['cyberpunk']; | |
--- ANY is still slow | |
--- time 47s | |
select * from movies where 'cyberpunk' = ANY(tags); | |
-- Conclusion | |
-- Use GIN index for arrays | |
-- Use @> for contains condition |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment