Last active
February 22, 2019 10:32
-
-
Save kovid-rathee/f9e34051cba49e7c02e3c96ab36abafe to your computer and use it in GitHub Desktop.
Remove duplicates in PostgreSQL and Amazon Redshift
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- when we want to keep only the first copy of a record by ID (where ID is integer). | |
-- Change min to max, when you want to keep the latest | |
DELETE FROM tab as t0 | |
WHERE t0.id <> (SELECT min(t1.id) FROM tab as t1 WHERE t0.id = t1.id); | |
-- when we want to identify distinct on the basis of selected columns, not the complete record | |
DELETE FROM tab t0 WHERE t0.id NOT IN | |
(SELECT t1.id FROM ( | |
SELECT DISTINCT ON (col_1, col_2, col_2, ..., col_n) * | |
FROM tab t1)); | |
-- when we want to use CTE to SELECT unique records and then delete everything that is not in the CTE | |
WITH unique_records AS | |
(SELECT DISTINCT ON (col_1, col_2, col_3, ... , col_n) * FROM tab t0) | |
DELETE FROM tab t1 WHERE t1.id NOT IN (SELECT id FROM unique_records); | |
-- using ctid (unique identified of row's physical storage) as a unique identifier of a row | |
DELETE FROM tab | |
WHERE ctid IN (SELECT ctid FROM | |
(SELECT *, | |
ctid, | |
ROW_NUMBER() OVER (PARTITION BY COL_1, COL_2, COL_3, ..., COL_N ORDER BY ctid) as rn | |
FROM tab) s | |
WHERE rn >= 2); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment