fphilipe/_setup.sql

## _setup.sql
-- Create users table with username as PK.
CREATE TABLE users_no_id (username varchar primary key, age int);
-- Create posts table with reference to users table using its PK and index on the reference.
CREATE TABLE posts_no_id (id serial primary key, username varchar references users_no_id (username), body text);
CREATE INDEX ON posts_no_id (username);
-- Create 1000000 users:
INSERT INTO users_no_id (username, age) SELECT substring(md5(random()::text) || md5(random()::text), (random()*32)::int, 16 + (random()*16)::int), 18 + (random()*70)::int FROM generate_series(1, 1000000);
-- Create 200000 posts for random users (multiple queries so users can have mutliple posts).
INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) || md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) || md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) || md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) || md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;

-- Create users table with id as PK and a unique index on username.
CREATE TABLE users_with_id (id serial primary key, username varchar, age int);
CREATE UNIQUE INDEX ON users_with_id (username);
-- Create posts table with reference to users table using its PK and index on the reference.
CREATE TABLE posts_with_id (id serial primary key, user_id integer references users_with_id (id), body text);
CREATE INDEX ON posts_with_id (user_id);
-- Create 1000000 users:
INSERT INTO users_with_id (username, age) SELECT substring(md5(random()::text) || md5(random()::text), (random()*32)::int, 16 + (random()*16)::int), 18 + (random()*70)::int FROM generate_series(1, 1000000);
-- Create 200000 posts for random users (multiple queries so users can have mutliple posts).
INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) || md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) || md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) || md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) || md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;

## analysis.md

      
    Raw
  

              analysis.md
            
          
    After running the above, the tables look as follows (\dt+):
                      List of relations
 Schema |     Name      | Type  | Owner | Size  | Description
--------+---------------+-------+-------+-------+-------------
 public | users_no_id   | table | phil  | 58 MB |
 public | posts_no_id   | table | phil  | 12 MB |
 public | users_with_id | table | phil  | 62 MB |
 public | posts_with_id | table | phil  | 10 MB |

The sum of the tables:

no_id total: 70MB
with_id total: 72MB

The indexes (\di+):
                                      List of relations
 Schema |            Name            | Type  | Owner |     Table     |  Size   | Description
--------+----------------------------+-------+-------+---------------+---------+-------------
 public | users_no_id_pkey           | index | phil  | users_no_id   | 57 MB   |
 public | posts_no_id_pkey           | index | phil  | posts_no_id   | 2208 kB |
 public | posts_no_id_username_idx   | index | phil  | posts_no_id   | 4480 kB |
 public | users_with_id_pkey         | index | phil  | users_with_id | 21 MB   |
 public | users_with_id_username_idx | index | phil  | users_with_id | 44 MB   |
 public | posts_with_id_pkey         | index | phil  | posts_with_id | 2208 kB |
 public | posts_with_id_user_id_idx  | index | phil  | posts_with_id | 2808 kB |

The sum of indexes:

no_id total: 64MB
with_id total: 66MB


After this, I increased the posts count to 1000000, leading to the following:
Tables:
                       List of relations
 Schema |     Name      | Type  | Owner |  Size  | Description
--------+---------------+-------+-------+--------+-------------
 public | users_no_id   | table | phil  | 58 MB  |
 public | posts_no_id   | table | phil  | 103 MB |
 public | users_with_id | table | phil  | 62 MB  |
 public | posts_with_id | table | phil  | 89 MB  |


no_id total: 161MB
with_id total: 151MB

Indexes:
 Schema |            Name            | Type  | Owner |     Table     | Size  | Description
--------+----------------------------+-------+-------+---------------+-------+-------------
 public | users_no_id_pkey           | index | phil  | users_no_id   | 57 MB |
 public | posts_no_id_pkey           | index | phil  | posts_no_id   | 18 MB |
 public | posts_no_id_username_idx   | index | phil  | posts_no_id   | 48 MB |
 public | users_with_id_pkey         | index | phil  | users_with_id | 21 MB |
 public | users_with_id_username_idx | index | phil  | users_with_id | 44 MB |
 public | posts_with_id_pkey         | index | phil  | posts_with_id | 18 MB |
 public | posts_with_id_user_id_idx  | index | phil  | posts_with_id | 24 MB |


no_id total: 123MB
with_id total: 107MB


With lower row counts the no id approach uses less data, also because the id approach requires an additional unique index. Once row count gets bigger, the id approach uses more data, but still not as much as I had expected.
E.g. the index on the foreign key for the no id approach is only double the size of the id appraoch. Also, the posts table in the no id appraoch stores the full username on each row, but is still only ~16% bigger.
	-- Create users table with username as PK.
	CREATE TABLE users_no_id (username varchar primary key, age int);
	-- Create posts table with reference to users table using its PK and index on the reference.
	CREATE TABLE posts_no_id (id serial primary key, username varchar references users_no_id (username), body text);
	CREATE INDEX ON posts_no_id (username);
	-- Create 1000000 users:
	INSERT INTO users_no_id (username, age) SELECT substring(md5(random()::text) \|\| md5(random()::text), (random()32)::int, 16 + (random()16)::int), 18 + (random()*70)::int FROM generate_series(1, 1000000);
	-- Create 200000 posts for random users (multiple queries so users can have mutliple posts).
	INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) \|\| md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) \|\| md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) \|\| md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_no_id (username, body) SELECT username, md5(random()::text) \|\| md5(random()::text) FROM users_no_id ORDER BY random() LIMIT 25000;

	-- Create users table with id as PK and a unique index on username.
	CREATE TABLE users_with_id (id serial primary key, username varchar, age int);
	CREATE UNIQUE INDEX ON users_with_id (username);
	-- Create posts table with reference to users table using its PK and index on the reference.
	CREATE TABLE posts_with_id (id serial primary key, user_id integer references users_with_id (id), body text);
	CREATE INDEX ON posts_with_id (user_id);
	-- Create 1000000 users:
	INSERT INTO users_with_id (username, age) SELECT substring(md5(random()::text) \|\| md5(random()::text), (random()32)::int, 16 + (random()16)::int), 18 + (random()*70)::int FROM generate_series(1, 1000000);
	-- Create 200000 posts for random users (multiple queries so users can have mutliple posts).
	INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) \|\| md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) \|\| md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) \|\| md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;
	INSERT INTO posts_with_id (user_id, body) SELECT id, md5(random()::text) \|\| md5(random()::text) FROM users_with_id ORDER BY random() LIMIT 25000;