JoshCheek/pg_diff_tables.rb

## pg_diff_tables.rb
# UPDATE: I've stopped putting updates into this gist
# as it's starting to get into the internal structure of our db
# Instead, you need to get the link to the private gist.
# You can ask me or Matt for it.

# -----------------------------


# CURRENT STATUS:
# * handles composite primary keys
# * handles case sensitive primary keys
# * handles 3 tables
#   * original: the common ancestor (aka parent, my naming is inconsistent in this gist)
#   * pcr:      one of the children, represents the data in the pcr schema
#   * prod:     the other child, represents the data in the public schema
# * identifies differences (in either table)
#   * rows added
#   * rows deleted
#   * rows modified
# * does not identify unchanged rows
#
# STILL TO DO:
# * emit these on a per-column basis instead of a per-row basis

# ===== SETUP =====
require 'pg'
db = PG.connect dbname: 'josh_testing'
db.exec 'begin'
def db.exec(*)
  super.to_a
end
def db.exec_params(*)
  super.to_a
end
# db.exec 'create extension hstore;'


# =====  FUNCTIONS =====
db.exec <<~SQL
-- https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns
-- select (get_primary_keys('tablename')).*
create or replace function get_primary_keys(table_name text)
  returns table(col_name text, data_type text)
  as $$
    SELECT
      a.attname::text as col_name,
      format_type(a.atttypid, a.atttypmod) AS data_type
    FROM pg_index i
    JOIN pg_attribute a
      ON a.attrelid = i.indrelid
      AND a.attnum = ANY(i.indkey)
    WHERE  i.indrelid = table_name::regclass
    AND    i.indisprimary;
  $$ language sql;


create or replace function diff_table(orig_table text, pcr_table text, prod_table text)
  returns table(pks text, table_name text, column_name text, orig_value text, pcr_value text, prod_value text)
  as $fn$
  declare
    pks text := (select string_agg(quote_ident(col_name), ',')
                 from get_primary_keys(orig_table));
    query_body text := format($$
     select
       (%1$s)::text  as pks,
       %2$L::text    as table_name,
       'FIXME'::text as column_name,
       orig.val1      as orig_value,
       pcr.val1       as pcr_value,
       prod.val1      as prod_value
     from %2$I      as orig
     full join %3$I as pcr  using (%1$s)
     full join %4$I as prod using (%1$s)
     where pcr is distinct from prod
     $$,
     pks, orig_table, pcr_table, prod_table
   );
  begin
    -- raise notice 'QUERY: %', query_body;
    return query execute query_body;
    -- TODO:
    --   For each row in the above query,
    --   instead of returning that row,
    --   pass it to a "diff-records" fn
    --   and have that fn emit column-name,orig,pcr,prod
    --   for each differing column
  end $fn$ language plpgsql;
SQL

# ===== TEST 1 =====
db.exec <<~SQL
-- composite primary key with case sensitive column names
create table parent (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
create table pcr    (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
create table prod   (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
insert into parent (id1, "iD2", val1)
  values (1, 1, 'unchanged'),
         (2, 1, 'deleted-pcr'),
         (2, 2, 'deleted-prod'),
         (2, 3, 'deleted-both'),
         (3, 1, 'modified-pcr'),
         (3, 2, 'modified-prod'),
         (3, 3, 'modified-both-same'),
         (3, 4, 'modified-both-different');
insert into pcr (id1, "iD2", val1)
  values (1, 1, 'unchanged'),
         (2, 2, 'deleted-prod'),
         (3, 1, 'MODIFIED-PCR'),
         (3, 2, 'modified-prod'),
         (3, 3, 'MODIFIED-BOTH-SAME'),
         (3, 4, 'MODIFIED-BOTH-DIFFERENT-PCR'),
         (4, 1, 'added-pcr');
insert into prod (id1, "iD2", val1)
  values (1, 1, 'unchanged'),
         (2, 1, 'deleted-pcr'),
         (3, 1, 'modified-pcr'),
         (3, 2, 'MODIFIED-PROD'),
         (3, 3, 'MODIFIED-BOTH-SAME'),
         (3, 4, 'MODIFIED-BOTH-DIFFERENT-PROD'),
         (4, 2, 'added-prod');
SQL
result       = db.exec("select * from diff_table('parent', 'pcr', 'prod')").sort_by { |r| r['pks'] }
EXPECTED_IDS = %w[(2,1) (2,2) (3,1) (3,2) (3,4) (4,1) (4,2)]
ACTUAL_IDS   = result.map { |row| row['pks'] }
headers      = result.first.to_h.keys
rows         = [
  headers,
  headers.map { |h| '-'*h.length },
  *result.map { |h|
    [ h["pks"],
      h["table_name"].inspect,
      h["column_name"].inspect,
      h["orig_value"].inspect,
      h["pcr_value"].inspect,
      h["prod_value"].inspect,
    ]
  }
]
format = rows.transpose.map { |col| col.map(&:length).max }.map { |n| "%-#{n}s" }.join(" ")
rows.each { |row| puts (format % row).strip }

if EXPECTED_IDS != ACTUAL_IDS
  EXPECTED_IDS # =>
  ACTUAL_IDS   # =>
  raise "EXPECTED: #{EXPECTED_IDS.inspect}\nACTUAL:   #{ACTUAL_IDS.inspect}"
end

# >> pks   table_name column_name orig_value                pcr_value                     prod_value
# >> ---   ---------- ----------- ----------                ---------                     ----------
# >> (2,1) "parent"   "FIXME"     "deleted-pcr"             nil                           "deleted-pcr"
# >> (2,2) "parent"   "FIXME"     "deleted-prod"            "deleted-prod"                nil
# >> (3,1) "parent"   "FIXME"     "modified-pcr"            "MODIFIED-PCR"                "modified-pcr"
# >> (3,2) "parent"   "FIXME"     "modified-prod"           "modified-prod"               "MODIFIED-PROD"
# >> (3,4) "parent"   "FIXME"     "modified-both-different" "MODIFIED-BOTH-DIFFERENT-PCR" "MODIFIED-BOTH-DIFFERENT-PROD"
# >> (4,1) "parent"   "FIXME"     nil                       "added-pcr"                   nil
# >> (4,2) "parent"   "FIXME"     nil                       nil                           "added-prod"
	# UPDATE: I've stopped putting updates into this gist
	# as it's starting to get into the internal structure of our db
	# Instead, you need to get the link to the private gist.
	# You can ask me or Matt for it.

	# -----------------------------


	# CURRENT STATUS:
	# * handles composite primary keys
	# * handles case sensitive primary keys
	# * handles 3 tables
	# * original: the common ancestor (aka parent, my naming is inconsistent in this gist)
	# * pcr: one of the children, represents the data in the pcr schema
	# * prod: the other child, represents the data in the public schema
	# * identifies differences (in either table)
	# * rows added
	# * rows deleted
	# * rows modified
	# * does not identify unchanged rows
	#
	# STILL TO DO:
	# * emit these on a per-column basis instead of a per-row basis

	# ===== SETUP =====
	require 'pg'
	db = PG.connect dbname: 'josh_testing'
	db.exec 'begin'
	def db.exec(*)
	super.to_a
	end
	def db.exec_params(*)
	super.to_a
	end
	# db.exec 'create extension hstore;'


	# ===== FUNCTIONS =====
	db.exec <<~SQL
	-- https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns
	-- select (get_primary_keys('tablename')).*
	create or replace function get_primary_keys(table_name text)
	returns table(col_name text, data_type text)
	as $$
	SELECT
	a.attname::text as col_name,
	format_type(a.atttypid, a.atttypmod) AS data_type
	FROM pg_index i
	JOIN pg_attribute a
	ON a.attrelid = i.indrelid
	AND a.attnum = ANY(i.indkey)
	WHERE i.indrelid = table_name::regclass
	AND i.indisprimary;
	$$ language sql;


	create or replace function diff_table(orig_table text, pcr_table text, prod_table text)
	returns table(pks text, table_name text, column_name text, orig_value text, pcr_value text, prod_value text)
	as $fn$
	declare
	pks text := (select string_agg(quote_ident(col_name), ',')
	from get_primary_keys(orig_table));
	query_body text := format($$
	select
	(%1$s)::text as pks,
	%2$L::text as table_name,
	'FIXME'::text as column_name,
	orig.val1 as orig_value,
	pcr.val1 as pcr_value,
	prod.val1 as prod_value
	from %2$I as orig
	full join %3$I as pcr using (%1$s)
	full join %4$I as prod using (%1$s)
	where pcr is distinct from prod
	$$,
	pks, orig_table, pcr_table, prod_table
	);
	begin
	-- raise notice 'QUERY: %', query_body;
	return query execute query_body;
	-- TODO:
	-- For each row in the above query,
	-- instead of returning that row,
	-- pass it to a "diff-records" fn
	-- and have that fn emit column-name,orig,pcr,prod
	-- for each differing column
	end $fn$ language plpgsql;
	SQL

	# ===== TEST 1 =====
	db.exec <<~SQL
	-- composite primary key with case sensitive column names
	create table parent (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
	create table pcr (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
	create table prod (id1 integer, "iD2" integer, val1 text, primary key (id1, "iD2"));
	insert into parent (id1, "iD2", val1)
	values (1, 1, 'unchanged'),
	(2, 1, 'deleted-pcr'),
	(2, 2, 'deleted-prod'),
	(2, 3, 'deleted-both'),
	(3, 1, 'modified-pcr'),
	(3, 2, 'modified-prod'),
	(3, 3, 'modified-both-same'),
	(3, 4, 'modified-both-different');
	insert into pcr (id1, "iD2", val1)
	values (1, 1, 'unchanged'),
	(2, 2, 'deleted-prod'),
	(3, 1, 'MODIFIED-PCR'),
	(3, 2, 'modified-prod'),
	(3, 3, 'MODIFIED-BOTH-SAME'),
	(3, 4, 'MODIFIED-BOTH-DIFFERENT-PCR'),
	(4, 1, 'added-pcr');
	insert into prod (id1, "iD2", val1)
	values (1, 1, 'unchanged'),
	(2, 1, 'deleted-pcr'),
	(3, 1, 'modified-pcr'),
	(3, 2, 'MODIFIED-PROD'),
	(3, 3, 'MODIFIED-BOTH-SAME'),
	(3, 4, 'MODIFIED-BOTH-DIFFERENT-PROD'),
	(4, 2, 'added-prod');
	SQL
	result = db.exec("select * from diff_table('parent', 'pcr', 'prod')").sort_by { \|r\| r['pks'] }
	EXPECTED_IDS = %w[(2,1) (2,2) (3,1) (3,2) (3,4) (4,1) (4,2)]
	ACTUAL_IDS = result.map { \|row\| row['pks'] }
	headers = result.first.to_h.keys
	rows = [
	headers,
	headers.map { \|h\| '-'*h.length },
	*result.map { \|h\|
	[ h["pks"],
	h["table_name"].inspect,
	h["column_name"].inspect,
	h["orig_value"].inspect,
	h["pcr_value"].inspect,
	h["prod_value"].inspect,
	]
	}
	]
	format = rows.transpose.map { \|col\| col.map(&:length).max }.map { \|n\| "%-#{n}s" }.join(" ")
	rows.each { \|row\| puts (format % row).strip }

	if EXPECTED_IDS != ACTUAL_IDS
	EXPECTED_IDS # =>
	ACTUAL_IDS # =>
	raise "EXPECTED: #{EXPECTED_IDS.inspect}\nACTUAL: #{ACTUAL_IDS.inspect}"
	end

	# >> pks table_name column_name orig_value pcr_value prod_value
	# >> --- ---------- ----------- ---------- --------- ----------
	# >> (2,1) "parent" "FIXME" "deleted-pcr" nil "deleted-pcr"
	# >> (2,2) "parent" "FIXME" "deleted-prod" "deleted-prod" nil
	# >> (3,1) "parent" "FIXME" "modified-pcr" "MODIFIED-PCR" "modified-pcr"
	# >> (3,2) "parent" "FIXME" "modified-prod" "modified-prod" "MODIFIED-PROD"
	# >> (3,4) "parent" "FIXME" "modified-both-different" "MODIFIED-BOTH-DIFFERENT-PCR" "MODIFIED-BOTH-DIFFERENT-PROD"
	# >> (4,1) "parent" "FIXME" nil "added-pcr" nil
	# >> (4,2) "parent" "FIXME" nil nil "added-prod"