Last active
January 23, 2024 13:43
-
-
Save maknahar/4e632973b58717a276536676c2ebc227 to your computer and use it in GitHub Desktop.
Query to find and delete duplicate records from MongoDB (works for millions of records)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Find and delete duplicate documents from a MongoDB collection.
 *
 * Run this inside the mongo shell, which provides the global `db`. Before
 * running, replace the placeholders `collection`, `filterKey`, `sortKey`,
 * `key1` and `key2` with the real collection and field names.
 *
 * The script works in two phases:
 *   1. An aggregation groups documents by the duplicate-defining keys and
 *      collects the _id of every document except the first one of each group.
 *   2. The collected _ids are deleted in fixed-size chunks.
 *
 * `var` is used on purpose so the script can be re-run in the same shell
 * session without "already declared" errors.
 */

/** Default number of _ids sent in a single delete request. */
var DELETE_CHUNK_SIZE = 100000;

/**
 * Build the aggregation pipeline that reports every group of duplicates.
 *
 * @returns {Array<Object>} Pipeline stages: filter, sort, group, keep groups with count > 1.
 */
function buildDuplicatesPipeline() {
  return [
    {
      /** Add any filter here. Add an index for the filter keys. */
      $match: {
        filterKey: { $exists: false }
      }
    },
    {
      /**
       * The first document of each group is the one that is retained, so sort
       * in the order that puts the document to keep first.
       */
      $sort: { sortKey: -1 }
    },
    {
      $group: {
        /**
         * These keys define a duplicate: documents with the same value for
         * key1 and key2 fall into the same group.
         * NOTE(review): documents that lack both keys presumably end up in one
         * shared group and would be treated as duplicates of each other -
         * confirm, or exclude them in the $match stage above.
         */
        _id: { key1: "$key1", key2: "$key2" },
        dups: { $push: { _id: "$_id" } },
        count: { $sum: 1 }
      }
    },
    {
      /** Only groups with more than one member contain duplicates. */
      $match: { count: { $gt: 1 } }
    }
  ];
}

/**
 * Collect the _ids of all documents that should be deleted.
 *
 * @param {Object} collection - Collection handle exposing `aggregate(pipeline, options)`.
 * @returns {Array} The _id of every duplicate except the first one of each group.
 */
function collectDuplicateIds(collection) {
  var ids = [];
  /** allowDiskUse lets the sort/group stages spill to disk on large collections. */
  var groups = collection.aggregate(buildDuplicatesPipeline(), { allowDiskUse: true });
  groups.forEach(function (group) {
    /** Start at index 1: the first entry is the document that is kept. */
    for (var k = 1; k < group.dups.length; k += 1) {
      ids.push(group.dups[k]._id);
    }
  });
  return ids;
}

/**
 * Delete the given _ids in chunks so no single request carries the whole list.
 *
 * @param {Object} collection - Collection handle exposing `bulkWrite(operations)`.
 * @param {Array} ids - The _ids to delete.
 * @param {number} [chunkSize=DELETE_CHUNK_SIZE] - Maximum number of _ids per request.
 * @returns {number} Sum of the `deletedCount` values reported by the server.
 * @throws {RangeError} If chunkSize is not a positive integer (it would never advance the loop).
 */
function deleteDuplicateIds(collection, ids, chunkSize) {
  var size = chunkSize === undefined ? DELETE_CHUNK_SIZE : chunkSize;
  if (typeof size !== "number" || !(size >= 1) || Math.floor(size) !== size) {
    throw new RangeError("chunkSize must be a positive integer");
  }

  var deleted = 0;
  for (var start = 0; start < ids.length; start += size) {
    var chunk = ids.slice(start, start + size);
    var result = collection.bulkWrite([
      { deleteMany: { filter: { _id: { $in: chunk } } } }
    ]);
    /** The result may not carry a count (e.g. unacknowledged writes). */
    if (result && typeof result.deletedCount === "number") {
      deleted += result.deletedCount;
    }
  }
  return deleted;
}

/** Ids of all duplicate documents found by the aggregation. */
var duplicates = [];

/** `db` only exists inside the mongo shell; outside of it nothing is executed. */
if (typeof db !== "undefined") {
  duplicates = collectDuplicateIds(db.collection);
  /** Delete the duplicates. */
  var deletedTotal = deleteDuplicateIds(db.collection, duplicates, DELETE_CHUNK_SIZE);
  print("Duplicates found: " + duplicates.length + ", documents deleted: " + deletedTotal);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment