Skip to content

Instantly share code, notes, and snippets.

@mmasashi
Last active August 31, 2020 22:04
Show Gist options
  • Save mmasashi/32602818772b110071bc to your computer and use it in GitHub Desktop.
Save mmasashi/32602818772b110071bc to your computer and use it in GitHub Desktop.
Duplicate Record Delete Query Generator For Amazon Redshift
# Duplicate record delete query generator for Amazon Redshift
# By running a generated query, duplicate rows in a specified table will be removed.
#
# Usage:
# ruby delete_dup_records_redshift.rb <table-name> <priary-keys-with-comma-separator>
unless ARGV.count == 2
puts <<EOT
Usage:
ruby delete_dup_records_redshift.rb <table name> <primary keys>
Example:
# Single primary key
ruby delete_dup_records_redshift.rb users id
# Composite primary keys
ruby delete_dup_records_redshift.rb users_groups user_id,group_id
# Specify schema name
ruby delete_dup_records_redshift.rb public.users id
EOT
exit 1
end
table = ARGV[0] # ex: 'm_test_table_multi_pk'
primary_keys = ARGV[1].split(',') # ex: 'id1,id2'
temp_table = "#{table}_temp_for_dup_rows_#{Time.now.strftime('%Y%m%d_%H%M%S')}"
#### Main
QUERY_TMPL = <<EOT
-- Check duplicate row count
SELECT '%{table}' as table, count(*) as num_dup_keys FROM (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1);
-- Delete duplicate rows
BEGIN;
LOCK %{table};
SELECT count(*) FROM (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1);
CREATE TABLE %{temp_table} (LIKE %{table});
INSERT INTO %{temp_table} (SELECT distinct a.* FROM %{table} a, (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1) b where %{insert_condition} );
DELETE FROM %{table} using %{temp_table} where %{delete_condition};
INSERT INTO %{table} (select * from %{temp_table});
DROP TABLE %{temp_table};
END;
EOT
puts QUERY_TMPL % {
table: table,
temp_table: temp_table,
primary_keys: primary_keys.join(','),
insert_condition: primary_keys.collect{|pk| "a.#{pk} = b.#{pk}"}.join(" AND "),
delete_condition: primary_keys.collect{|pk| "#{table}.#{pk} = #{temp_table}.#{pk}"}.join(" AND "),
}
# Single primary key
$ ruby delete_dup_records_redshift.rb m_test_table id
-- Check duplicate row count
SELECT 'm_test_table' as table, count(*) as num_dup_keys FROM (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1);
-- Delete duplicate rows
BEGIN;
LOCK m_test_table;
SELECT count(*) FROM (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1);
CREATE TABLE m_test_table_temp_for_dup_rows_20160315_153707 (LIKE m_test_table);
INSERT INTO m_test_table_temp_for_dup_rows_20160315_153707 (SELECT distinct a.* FROM m_test_table a, (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1) b where a.id = b.id );
DELETE FROM m_test_table using m_test_table_temp_for_dup_rows_20160315_153707 where m_test_table.id = m_test_table_temp_for_dup_rows_20160315_153707.id;
INSERT INTO m_test_table (select * from m_test_table_temp_for_dup_rows_20160315_153707);
DROP TABLE m_test_table_temp_for_dup_rows_20160315_153707;
END;
# Composite primary keys
$ ruby delete_dup_records_redshift.rb m_test_table_multi_pk id1,id2
-- Check duplicate row count
SELECT 'm_test_table_multi_pk' as table, count(*) as num_dup_keys FROM (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1);
-- Delete duplicate rows
BEGIN;
LOCK m_test_table_multi_pk;
SELECT count(*) FROM (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1);
CREATE TABLE m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 (LIKE m_test_table_multi_pk);
INSERT INTO m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 (SELECT distinct a.* FROM m_test_table_multi_pk a, (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1) b where a.id1 = b.id1 AND a.id2 = b.id2 );
DELETE FROM m_test_table_multi_pk using m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 where m_test_table_multi_pk.id1 = m_test_table_multi_pk_temp_for_dup_rows_20160315_153607.id1 AND m_test_table_multi_pk.id2 = m_test_table_multi_pk_temp_for_dup_rows_20160315_153607.id2;
INSERT INTO m_test_table_multi_pk (select * from m_test_table_multi_pk_temp_for_dup_rows_20160315_153607);
DROP TABLE m_test_table_multi_pk_temp_for_dup_rows_20160315_153607;
END;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment