Create a gist now

Instantly share code, notes, and snippets.

@cpancake /dedup.js
Last active Jun 25, 2017

What would you like to do?
var fs = require('graceful-fs'),
log = require('single-line-log').stdout,
crypto = require('crypto'),
path = require('path'),
async = require('async');
// Stream a file through SHA-1 and hand the hex digest to a callback.
//   name - path of the file to read
//   cb   - called as cb(hexDigest) once the read stream emits 'end'
// No 'error' listener is attached to the read stream.
function hashFile(name, cb)
{
    var sha1 = crypto.createHash('sha1');
    var input = fs.createReadStream(name);

    // feed every chunk read from disk into the running digest
    function feed(chunk)
    {
        sha1.update(chunk);
    }

    // the whole file has been consumed: finalise and report
    function finish()
    {
        cb(sha1.digest('hex'));
    }

    input.on('data', feed);
    input.on('end', finish);
}
// Report every group of files that share a hash and, when the first
// command-line argument is exactly 'keep-first', delete all but one file of
// each group.
//   hashes - object mapping a hash string to the array of file names that
//            produced that hash
// Each group is sorted before it is used, so the file that survives is always
// the one whose name sorts first, whatever order the names were added in.
// fs.unlinkSync is not guarded: a failed delete throws and stops the run.
function runDedup(hashes)
{
    var groups = Object.keys(hashes)
        .filter((k) => hashes[k].length > 1)
        // sort a copy so the caller's arrays are left untouched
        .map((k) => hashes[k].slice().sort());

    console.log('\n');
    console.log('found duplicates:');
    groups.forEach((group) => {
        console.log('\t' + group.join(', '));
    });

    // without the explicit opt-in this is a report-only run
    if(process.argv[2] !== 'keep-first')
    {
        return;
    }

    log('deleting duplicates');
    groups.forEach((group) => {
        // group[0] is the copy that is kept; everything after it is removed
        group.slice(1).forEach((f) => {
            log('deleting ' + f);
            fs.unlinkSync(f);
        });
    });
}
// find all regular files in the current directory
log('finding files');
// lstat (not stat) is used so symbolic links are skipped instead of followed:
// a link and its target hash identically, and deleting the target while
// keeping the link would lose the data. isFile() also leaves out FIFOs,
// sockets and devices, which are not meaningful to hash.
const files = fs.readdirSync('.').filter(function(f) {
    return fs.lstatSync(f).isFile();
});
log('found ' + files.length + ' files');

// hash string -> names of the files that produced it
const hashes = {};
let numComplete = 0;

// compute hashes of all files found: one task per file, each recording its
// result in `hashes` and updating the progress line before signalling completion
const cbs = files.map(function(f) {
    return (cb) => {
        hashFile(f, function(hash) {
            if(hashes[hash] == null)
            {
                hashes[hash] = [];
            }
            hashes[hash].push(f);
            numComplete++;
            log('hashing ' + numComplete + '/' + files.length);
            cb();
        });
    };
});

// hash ten files at a time, then dedup once every task has finished
async.parallelLimit(cbs, 10, (err) => {
    if(err)
    {
        // never continue to the delete phase with an incomplete hash table
        throw err;
    }
    runDedup(hashes);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment