Skip to content

Instantly share code, notes, and snippets.

@kebman
Created October 2, 2015 02:12
Show Gist options
  • Save kebman/4f925184c72b8c3a18db to your computer and use it in GitHub Desktop.
Save kebman/4f925184c72b8c3a18db to your computer and use it in GitHub Desktop.
Check for duplicate files with Node.js
// find duplicate files with Node.js, cuz, you know, it's useful
var Promise = require('bluebird');
var fs = Promise.promisifyAll(require('fs'));
var crypto = require('crypto');
var path = require('path');
// Directories to compare. Both may be given on the command line:
//   node <script> [pathA] [pathB]
// When an argument is omitted the original hard-coded value is used.
var pathA = process.argv[2] || "."; // defaults to the folder you're in, wherever that might be
var pathB = process.argv[3] || "/path/to/the/directory/you/want/to/compare/it/to"; // placeholder default - pass a real directory
// Shared result list: one { digest, path } record is pushed per hashed file.
var hashes = [];
// I dub this: promise hell! (or possibly this could just be done eons better, but well, it works...)
/**
 * Computes the MD5 digest of one file by streaming its contents.
 *
 * @param {string} filePath - Path of the file to hash.
 * @returns {Promise<string>} Resolves with the hex digest once the stream ends;
 *     rejects if the stream emits 'error'.
 */
function hashFile(filePath) {
  return new Promise(function (resolve, reject) {
    var hash = crypto.createHash('md5');
    var stream = fs.createReadStream(filePath);
    // Without an 'error' listener a failed read is thrown as an uncaught
    // exception instead of rejecting this promise.
    stream.on('error', reject);
    stream.on('data', function (chunk) {
      // No stream encoding is set, so chunks are Buffers and are hashed as raw bytes.
      hash.update(chunk);
    });
    stream.on('end', function () {
      resolve(hash.digest('hex'));
    });
  });
}

/**
 * Sorts the shared `hashes` list by digest (in place) and logs every entry
 * whose digest equals that of the entry directly before it.
 *
 * Three or more identical files are therefore reported as a chain of
 * adjacent pairs (B equal to A, C equal to B, ...).
 */
function reportDuplicates() {
  hashes.sort(function (a, b) {
    if (a.digest < b.digest) {
      return -1;
    }
    if (a.digest > b.digest) {
      return 1;
    }
    return 0;
  });
  var dupe = 1;
  // Start at 1 so every entry has a predecessor to compare against.
  for (var index = 1; index < hashes.length; index++) {
    var current = hashes[index];
    var previous = hashes[index - 1];
    if (current.digest === previous.digest) {
      console.log("Dupe " + dupe + " found:");
      console.log(current.path);
      console.log("Equal to:");
      console.log(previous.path + "\n");
      dupe++;
    }
  }
}

/**
 * Hashes every regular file directly inside `folder` (no recursion) and
 * appends a { digest, path } record for each one to the module-level
 * `hashes` array.
 *
 * Completion is tracked through the module-level counter `i`: when a call
 * finishes while `i` is 1 it prints the duplicate report, and every finished
 * call then increments `i`. With `i` starting at 0 and two calls in flight,
 * whichever call finishes second prints the report.
 *
 * @param {string} folder - Directory whose files should be hashed.
 * @returns {Promise} Settles when this directory has been processed; rejects
 *     if listing the directory, stat-ing an entry or reading a file fails
 *     (in which case `i` is not incremented).
 */
function hashDirIn(folder) {
  return fs.readdirAsync(folder).then(function (fileNames) {
    var pending = fileNames.map(function (fileName) {
      var workPath = path.join(folder, fileName);
      return fs.statAsync(workPath).then(function (stats) {
        if (!stats.isFile()) {
          return undefined; // directories and other non-files are skipped
        }
        return hashFile(workPath).then(function (digest) {
          hashes.push({digest: digest, path: workPath});
        });
      });
    });
    return Promise.all(pending);
  }).then(function () {
    // sort and display dupes once the other directory has already finished
    if (i === 1) {
      reportDuplicates();
    }
    i++;
  });
}
// Completion counter shared with hashDirIn: each finished scan increments it,
// and the scan that finishes while it equals 1 prints the duplicate report.
var i = 0;
// Kick off both directory scans, pathA first and pathB second.
[pathA, pathB].forEach(function (folder) {
  hashDirIn(folder);
});
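// Note: the two hashDirIn calls above already run concurrently - each one only starts its asynchronous directory scan and returns right away; the shared counter `i` makes whichever scan finishes second print the report. (It was late and the bottle of Ileach was nearly empty.)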
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment