kovid-rathee/remove_duplicates.sql

## remove_duplicates.sql
-- when we want to keep only the first copy of a record by ID (where ID is integer).
-- Change min to max, when you want to keep the latest
DELETE FROM tab as t0
WHERE t0.id <> (SELECT min(t1.id) FROM tab as t1 WHERE t0.id = t1.id);

-- when we want to identify distinct on the basis of selected columns, not the complete record
DELETE FROM tab t0 WHERE t0.id NOT IN
(SELECT t1.id FROM (
    SELECT DISTINCT ON (col_1, col_2, col_2, ..., col_n) *
  FROM tab t1));

-- when we want to use CTE to SELECT unique records and then delete everything that is not in the CTE
WITH unique_records AS
    (SELECT DISTINCT ON (col_1, col_2, col_3, ... , col_n) * FROM tab t0)
DELETE FROM tab t1 WHERE t1.id NOT IN (SELECT id FROM unique_records);

-- using ctid (unique identified of row's physical storage) as a unique identifier of a row
DELETE FROM tab
WHERE ctid IN (SELECT ctid FROM
(SELECT *,
        ctid,
        ROW_NUMBER() OVER (PARTITION BY COL_1, COL_2, COL_3, ..., COL_N ORDER BY ctid) as rn
 FROM tab) s
WHERE rn >= 2);
	-- when we want to keep only the first copy of a record by ID (where ID is integer).
	-- Change min to max, when you want to keep the latest
	DELETE FROM tab as t0
	WHERE t0.id <> (SELECT min(t1.id) FROM tab as t1 WHERE t0.id = t1.id);

	-- when we want to identify distinct on the basis of selected columns, not the complete record
	DELETE FROM tab t0 WHERE t0.id NOT IN
	(SELECT t1.id FROM (
	SELECT DISTINCT ON (col_1, col_2, col_2, ..., col_n) *
	FROM tab t1));

	-- when we want to use CTE to SELECT unique records and then delete everything that is not in the CTE
	WITH unique_records AS
	(SELECT DISTINCT ON (col_1, col_2, col_3, ... , col_n) * FROM tab t0)
	DELETE FROM tab t1 WHERE t1.id NOT IN (SELECT id FROM unique_records);

	-- using ctid (unique identified of row's physical storage) as a unique identifier of a row
	DELETE FROM tab
	WHERE ctid IN (SELECT ctid FROM
	(SELECT *,
	ctid,
	ROW_NUMBER() OVER (PARTITION BY COL_1, COL_2, COL_3, ..., COL_N ORDER BY ctid) as rn
	FROM tab) s
	WHERE rn >= 2);