-- Various bits and pieces of code that turned out to be useful for the 'Load' step of ETL
-- (i.e., loading big chunks of data into postgres tables).
-- postgres' COPY is far more efficient than a series of individual INSERT statements:
\copy table_name (col1, col2, …) from '/path/to/file.txt'
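-- If the dump is a CSV file rather than postgres' default text format, the same command accepts
-- COPY options. A minimal variant, assuming a comma-separated file with a header row:
\copy table_name (col1, col2, …) from '/path/to/file.csv' with (format csv, header)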
-- If the text file to load the data from is unusually large, it can also be kept compressed and
-- decompressed on the fly. Note that, regardless of which of the two methods is used (loading directly
-- from a file or decompressing it first with zcat), COPY will *not* load the entire file into memory,
-- so both methods work for very large files even if server and client have little memory available.
\copy table_name (col1, col2, …) from program 'zcat /path/to/file.txt.gz'
-- Even though COPY will not load the entire file into memory, memory usage will still grow with the
-- size of the file. One simple way to avoid this is to split the text file into smaller files and run
-- COPY on each of them. With split's --filter option and "\copy […] from stdin", each chunk is piped
-- to psql directly, so the chunks never have to be written to disk.
-- Notice that this only works if the columns contain no newlines, or only escaped newlines.
-- Note that the following is shell, not SQL:
split -l1000000 --filter="psql -c '\copy table_name (col1, col2, …) from stdin'" /path/to/file.txt
-- Obviously, this method can also be used with compressed files:
zcat /path/to/file.txt.gz | split -l1000000 --filter="psql -c '\copy table_name (col1, col2, …) from stdin'"
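-- The newline caveat above is usually harmless: dumps produced with COPY … TO in postgres' default
-- text format already have embedded newlines escaped as \n, so splitting such a file by lines is safe.
-- A minimal sketch of producing such a dump (the file path is just an example):
\copy table_name (col1, col2, …) to '/path/to/file.txt'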
-- With ETL and data warehousing, you might end up with the following scenario: the table in the source
-- database is too large to be copied to the target database in full on every synchronization. Existing
-- rows may change, so you cannot simply copy only the rows that were added since the last synchronization.
-- You may, however, assume that rows with an ID below a given threshold will no longer change in the
-- source database. So instead of copying the entire table, you copy only those rows whose ID is equal
-- to or larger than the threshold. You can then load that text file into a staging table and let
-- postgres figure out which rows have actually changed and should therefore be written to the actual table.
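-- First, load the extracted rows into a staging table. A minimal sketch, assuming table_name_tmp
-- should mirror the target's columns ((like table_name) copies the column definitions, but no indexes),
-- and that the dump path is just an example:
create table if not exists table_name_tmp (like table_name);
\copy table_name_tmp (id, col1, col2, …) from '/path/to/file.txt'
-- Then let postgres compute the difference inside a single transaction: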
begin;
-- Delete all rows above the threshold that no longer match the source. This covers both rows that
-- were deleted in the source database and the old versions of rows that have changed (the changed
-- rows are re-inserted below).
delete from table_name
where id in (
  select id from (
    select id, col1, col2, … from table_name where id >= (select min(id) from table_name_tmp)
    except
    select id, col1, col2, … from table_name_tmp
  ) deleted_ids
);
-- Next, insert the rows that are new or have changed:
insert into table_name (id, col1, col2, …)
select id, col1, col2, … from (
  select id, col1, col2, … from table_name_tmp
  except
  select id, col1, col2, … from table_name where id >= (select min(id) from table_name_tmp)
) diff;
truncate table table_name_tmp;
commit;
-- Of course, it would be simpler to just delete all rows of the target table whose id is larger than
-- or equal to the threshold and then COPY the text file directly into the target table, with no staging
-- table at all (a sketch follows below). However, the approach described above has two advantages:
-- 1. New rows enter the target table via a regular INSERT instead of COPY. Note that in postgres,
-- COPY FROM does invoke triggers and check constraints, but unlike INSERT, it does not invoke rules.
-- 2. If many indexes exist on the target table, it might be faster to first insert everything into a temporary table (without
-- any indexes), and then insert only a small fraction of the rows of the temporary table into the target table.
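-- For reference, a minimal sketch of the simpler alternative: delete everything above the threshold,
-- then COPY the file straight into the target table. threshold_id is a psql variable, assumed to be
-- set beforehand (e.g. \set threshold_id 1000000):
begin;
delete from table_name where id >= :threshold_id;
\copy table_name (id, col1, col2, …) from '/path/to/file.txt'
commit;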