Skip to content

Instantly share code, notes, and snippets.

@sovetnik
Last active October 30, 2017 22:57
Show Gist options
  • Save sovetnik/18bbade85205667d6e8b881caa28df30 to your computer and use it in GitHub Desktop.
Save sovetnik/18bbade85205667d6e8b881caa28df30 to your computer and use it in GitHub Desktop.
Data and query to select authors which not collaborate with others
-- Create authors
CREATE TABLE authors(id serial PRIMARY KEY, name char(255));
-- Single author of many books
INSERT INTO authors(name)
VALUES ('William Gibson');
-- Collective of authors of one book
INSERT INTO authors(name)
VALUES ('Erich Gamma'), ('Richard Helm'), ('Ralph Johnson'), ('John Vlissides ');
-- Main author and collaborator
INSERT INTO authors(name)
VALUES ('Daniel Kahneman'), ('Amos Twersky');
-- Create books
CREATE TABLE books(id serial PRIMARY KEY, title char(255));
-- William Gibson books
INSERT INTO books(title)
VALUES('Neuromancer'), ('Count Zero'), ('Mona Lisa Overdrive');
-- GoF book
INSERT INTO books(title)
VALUES('Design Patterns: Elements of Reusable Object-Oriented Software');
-- Only Kahneman
INSERT INTO books(title)
VALUES('Thinking, Fast and Slow');
-- Kahneman and Tversky as coauthor
INSERT INTO books(title)
VALUES('Choices, Values and Frames');
-- Add authors and books relations
CREATE TABLE book_authors(
author_id int REFERENCES authors(id),
book_id int REFERENCES books(id),
main_author boolean
);
-- with constraints
-- book can be written only once
CREATE UNIQUE INDEX authorship_idx
ON book_authors(author_id, book_id);
-- book can have only one main author
CREATE UNIQUE INDEX main_author_idx
ON book_authors(book_id, main_author)
WHERE main_author = TRUE;
--and data
INSERT INTO book_authors(author_id, book_id, main_author)
VALUES (1,1, FALSE), (1,2, FALSE), (1,3, FALSE),
(2,4, FALSE), (3,4, FALSE), (4,4, FALSE), (5,4, FALSE),
(6,5, FALSE), (6,6, TRUE), (7,6, FALSE);
-- Query to get all authors not seen in collaboration
EXPLAIN WITH
books_aggregate(book_id, authors_count) as(
SELECT book_id, count(author_id) as authors_count
FROM book_authors
GROUP BY book_id
ORDER BY book_id
),
collective_books(id) as (
SELECT book_id
FROM books_aggregate
WHERE authors_count > 1
),
collective_authors(id) as (
select ba.author_id
FROM book_authors ba
JOIN collective_books cb
ON ba.book_id = cb.id
)
SELECT *
FROM authors a
WHERE a.id
NOT IN (SELECT id FROM collective_authors);
/* +----------------------------------------------------------------------------------------+ */
/* | QUERY PLAN | */
/* |----------------------------------------------------------------------------------------| */
/* | Seq Scan on authors a (cost=126.40..137.27 rows=35 width=1028) | */
/* | Filter: (NOT (hashed SubPlan 4)) | */
/* | CTE books_aggregate | */
/* | -> Sort (cost=53.54..54.04 rows=200 width=12) | */
/* | Sort Key: book_authors.book_id | */
/* | -> HashAggregate (cost=43.90..45.90 rows=200 width=12) | */
/* | Group Key: book_authors.book_id | */
/* | -> Seq Scan on book_authors (cost=0.00..32.60 rows=2260 width=8) | */
/* | CTE collective_books | */
/* | -> CTE Scan on books_aggregate (cost=0.00..4.50 rows=67 width=4) | */
/* | Filter: (authors_count > 1) | */
/* | CTE collective_authors | */
/* | -> Hash Join (cost=2.18..50.82 rows=757 width=4) | */
/* | Hash Cond: (ba.book_id = cb.id) | */
/* | -> Seq Scan on book_authors ba (cost=0.00..32.60 rows=2260 width=8) | */
/* | -> Hash (cost=1.34..1.34 rows=67 width=4) | */
/* | -> CTE Scan on collective_books cb (cost=0.00..1.34 rows=67 width=4) | */
/* | SubPlan 4 | */
/* | -> CTE Scan on collective_authors (cost=0.00..15.14 rows=757 width=4) | */
/* +----------------------------------------------------------------------------------------+ */
CREATE INDEX book_idx ON book_authors(book_id);
-- Same query after ad index
/* +---------------------------------------------------------------------------------------+ */
/* | QUERY PLAN | */
/* |---------------------------------------------------------------------------------------| */
/* | Seq Scan on authors a (cost=3.00..13.87 rows=35 width=1028) | */
/* | Filter: (NOT (hashed SubPlan 4)) | */
/* | CTE books_aggregate | */
/* | -> GroupAggregate (cost=1.27..1.44 rows=10 width=12) | */
/* | Group Key: book_authors.book_id | */
/* | -> Sort (cost=1.27..1.29 rows=10 width=8) | */
/* | Sort Key: book_authors.book_id | */
/* | -> Seq Scan on book_authors (cost=0.00..1.10 rows=10 width=8) | */
/* | CTE collective_books | */
/* | -> CTE Scan on books_aggregate (cost=0.00..0.22 rows=3 width=4) | */
/* | Filter: (authors_count > 1) | */
/* | CTE collective_authors | */
/* | -> Hash Join (cost=0.10..1.26 rows=3 width=4) | */
/* | Hash Cond: (ba.book_id = cb.id) | */
/* | -> Seq Scan on book_authors ba (cost=0.00..1.10 rows=10 width=8) | */
/* | -> Hash (cost=0.06..0.06 rows=3 width=4) | */
/* | -> CTE Scan on collective_books cb (cost=0.00..0.06 rows=3 width=4) | */
/* | SubPlan 4 | */
/* | -> CTE Scan on collective_authors (cost=0.00..0.06 rows=3 width=4) | */
/* +---------------------------------------------------------------------------------------+ */
/* -- Export to csv */
/* COPY( */
/* WITH */
/* books_aggregate(book_id, authors_count) as( */
/* SELECT book_id, count(author_id) as authors_count */
/* FROM book_authors */
/* GROUP BY book_id */
/* ORDER BY book_id */
/* ), */
/* collective_books(id) as ( */
/* SELECT book_id */
/* FROM books_aggregate */
/* WHERE authors_count > 1 */
/* ), */
/* collective_authors(id) as ( */
/* select ba.author_id */
/* FROM book_authors ba */
/* JOIN collective_books cb */
/* ON ba.book_id = cb.id */
/* ) */
/* SELECT * */
/* FROM authors a */
/* WHERE a.id */
/* NOT IN (SELECT id FROM collective_authors) */
/* ) TO '/tmp/mustread.csv' WITH CSV HEADER; */
/* --Cleanup */
/* DROP TABLE book_authors; */
/* DROP TABLE authors; */
/* DROP TABLE books; */
/* DROP INDEX main_author_idx; */
/* DROP INDEX authorship_idx; */
/* DROP INDEX book_idx; */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment