Skip to content

Instantly share code, notes, and snippets.

@Dillie-O
Created September 11, 2012 17:28
Show Gist options
  • Save Dillie-O/3700029 to your computer and use it in GitHub Desktop.
Save Dillie-O/3700029 to your computer and use it in GitHub Desktop.
Advanced Remove Duplicate Row Query in SQL
-- Algorithm --
-- 1. Run query to get all duplicate count values. Since we're getting cloned
-- rows, we can run the query based off of all fields in the table. Store
-- in table variable.
--
-- 2. Iterate through table.
--
-- 2a. Use DELETE TOP(x - 1) remove all duplicates.
--
-- 3. Done
DECLARE @RowsToProcess int
DECLARE @CurrentRow int
DECLARE @Message varchar(MAX)
DECLARE @duplicate_count int
DECLARE @application_id int
DECLARE @lock_version int
DECLARE @DuplicateVersions TABLE
(
row_id int NOT NULL PRIMARY KEY IDENTITY(1,1),
duplicate_count int,
application_id int,
lock_version int,
asset_id varchar(50),
customer_id int,
received_date datetime,
fee_amount money,
created_at datetime,
updated_at datetime
)
INSERT INTO @DuplicateVersions
(
duplicate_count,
application_id,
lock_version,
asset_id,
customer_id,
received_date,
fee_amount,
created_at,
updated_at
)
SELECT COUNT(*) AS duplicate_count,
[application_id],
[lock_version],
[asset_id],
[customer_id],
[received_date],
[fee_amount],
[created_at],
[updated_at]
FROM application_versions
GROUP BY [application_id],
[lock_version],
[asset_id],
[customer_id],
[received_date],
[fee_amount],
[created_at],
[updated_at]
HAVING COUNT(*) > 1
ORDER BY [application_id] ASC, [lock_version] ASC
SELECT @RowsToProcess = COUNT(*) FROM @DuplicateVersions
SET @Message = 'Duplicate rows retrieved. Records to process: ' + CAST(@RowsToProcess AS VARCHAR)
RAISERROR (@Message, 10, 1) WITH NOWAIT
SET @CurrentRow = 1
-- Since we're using a DELETE TOP (x) command and we cannot pass a column name
-- into the x parameter, we need to iterate through the table of duplicates and
-- call the delete statement, making sure to join all column names in our JOIN
-- clause since we are comparing against all values.
WHILE @CurrentRow <= @RowsToProcess
BEGIN
SELECT @duplicate_count = duplicate_count,
@application_id = application_id,
@lock_version = lock_version
FROM @DuplicateVersions AS DuplicateVersions
WHERE row_id = @CurrentRow
SET @Message = '[' + CAST(@CurrentRow AS VARCHAR) + ' of ' + CAST(@RowsToProcess AS VARCHAR) + '] AppID: ' + CAST(@application_id AS VARCHAR) + ' / Lock: ' + CAST(@lock_version AS VARCHAR) + ' - ' + CAST((@duplicate_count - 1) AS VARCHAR) + ' Duplicate(s)'
RAISERROR (@Message, 10, 1) WITH NOWAIT
DELETE TOP (@duplicate_count - 1) application_versions
FROM application_versions
RIGHT JOIN @DuplicateVersions AS DuplicateVersions
ON ISNULL(DuplicateVersions.application_id, -999) = ISNULL(application_versions.application_id, -999)
AND ISNULL(DuplicateVersions.lock_version, -999) = ISNULL(application_versions.lock_version, -999)
AND ISNULL(DuplicateVersions.asset_id, -999) = ISNULL(application_versions.asset_id, -999)
AND ISNULL(DuplicateVersions.customer_id, -999) = ISNULL(application_versions.customer_id, -999)
AND ISNULL(DuplicateVersions.received_date, -999) = ISNULL(application_versions.received_date, -999)
AND ISNULL(DuplicateVersions.fee_amount, -999) = ISNULL(application_versions.fee_amount, -999)
AND ISNULL(DuplicateVersions.created_at, -999) = ISNULL(application_versions.created_at, -999)
AND ISNULL(DuplicateVersions.updated_at, -999) = ISNULL(application_versions.updated_at, -999)
WHERE DuplicateVersions.row_id = @CurrentRow
SET @CurrentRow = @CurrentRow + 1
END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment