Skip to content

Instantly share code, notes, and snippets.

@omerida
Created March 30, 2012 13:59
Show Gist options
  • Save omerida/2251728 to your computer and use it in GitHub Desktop.
Save omerida/2251728 to your computer and use it in GitHub Desktop.
Mysql find and update duplicate fields before creating a unique index
# create temp table
CREATE TEMPORARY TABLE link_dupes (
`id` int(11) unsigned NOT NULL,
`user_id` int(11) unsigned NOT NULL,
`link` varchar(1000) NOT NULL,
`link_unique` varchar(1000) NOT NULL
);
# find rows with duplicate link fields, into temp table
# add a hash to the url so that it still works
INSERT INTO `link_dupes`
SELECT id, user_id, link, CONCAT(link, '#', CONV(id, 10, 36)) FROM `user_links` uout WHERE EXISTS (
SELECT 1
FROM `user_links` uin
WHERE uout.link=uin.link AND uout.user_id=uin.user_id
LIMIT 1,100
)
;
# update user_links table with deduped links
INSERT INTO `user_links` (id, user_id, link)
SELECT id, user_id, link_unique FROM `link_dupes`
ON DUPLICATE KEY UPDATE link=VALUES(link);
# now we can create a unique index
# hash the link using SHA1 so that the fingerprint for a link is a known length that we can index.
ALTER TABLE `user_links` ADD `link_hash` char(160) NOT NULL AFTER `link`;
UPDATE `user_links` SET `link_hash`=SHA1(`link`);
ALTER TABLE `user_links` ADD UNIQUE KEY `idx_user_hash_link` (`user_id`,`link_hash`);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment