Skip to content

Instantly share code, notes, and snippets.

@kapravel
Created May 12, 2014 22:44
Show Gist options
  • Save kapravel/773899af97d0a5ab774f to your computer and use it in GitHub Desktop.
Save kapravel/773899af97d0a5ab774f to your computer and use it in GitHub Desktop.
Removing duplicate files in GridFS
// Run this script if the following command fails to clear duplicates
// > db.fs.files.ensureIndex( { md5: 1}, {unique: true, dropDups: true} )
// Detect duplicates in db.fs.files: count how many files share each md5 hash.
/**
 * Map step of the duplicate scan: emits a count of 1 keyed by the file's md5.
 *
 * Intended to be passed to mapReduce, which invokes it with `this` bound to
 * the document being processed. Documents without an md5 value are skipped so
 * that they are never grouped under a single null key and mistaken for
 * duplicates of each other.
 *
 * Declared with `var` (not as an implicit global) so the script also works in
 * strict mode and can be re-run in the same shell session.
 */
var m = function () {
    // `== null` covers both a missing field (undefined) and an explicit null.
    if (this.md5 == null) {
        return;
    }
    emit(this.md5, 1);
};
/**
 * Reduce step of the duplicate scan: adds up the counts emitted for one key.
 *
 * Uses a plain loop instead of the shell-specific `Array.sum` helper so the
 * function does not depend on non-standard extensions. Because the result is
 * a sum, feeding partial sums back in (re-reduce) yields the same total.
 *
 * @param {*} k - The key being reduced (unused).
 * @param {number[]} vals - Counts (or partial sums) collected for that key.
 * @returns {number} The total of all values in `vals`.
 */
var r = function (k, vals) {
    var total = 0;
    for (var i = 0; i < vals.length; i++) {
        total += vals[i];
    }
    return total;
};
// Run the map-reduce over db.fs.files. The per-md5 counts are written to the
// "myDupesCollection" collection as { _id: <md5>, value: <count> } documents.
// `var` avoids creating an implicit global.
var res = db.fs.files.mapReduce(m, r, { out: "myDupesCollection" });
// Delete duplicate files AND their corresponding chunks.
// For every md5 that occurs more than once, the oldest upload is kept and
// every other copy is removed from fs.files together with its fs.chunks.
db.myDupesCollection.find({ value: { $gt: 1 } }).forEach(function (obj) {
    // A null key would make the md5 query below match every file that has no
    // md5 at all; such files must never be treated as duplicates of each other.
    if (obj._id == null) {
        return;
    }

    // Materialise the ids first so fs.files is not modified while a cursor
    // over it is still open. Sorting makes the surviving copy deterministic
    // (oldest uploadDate, ties broken by _id).
    var ids = db.fs.files
        .find({ md5: obj._id }, { _id: 1 })
        .sort({ uploadDate: 1, _id: 1 })
        .toArray();

    // ids[0] is the copy to keep; everything after it is a duplicate.
    for (var i = 1; i < ids.length; i++) {
        var fileId = ids[i]._id;
        print(fileId);
        // Remove the file document first, then its chunks: an interruption
        // leaves orphaned chunks at worst, never a file with missing data.
        db.fs.files.remove({ _id: fileId });
        db.fs.chunks.remove({ files_id: fileId });
    }
});
// Drop the temporary duplicates collection once you no longer need it:
//db.myDupesCollection.drop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment