#!/bin/sh
#
# Simple file based dedupe producer.
# The idea is to run this on one or more directories; the result is a
# list of md5sum/filename tuples for files that are potentially
# duplicates (finish the job by running cmp to make sure).
# Copyright 2010 Sterling Commerce, Inc.
# Copyright 2010 Christopher Jay Cox
#
# http://endlessnow.com/ten/Source/dedupe-sh.txt
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Filters here: size, inode, md5sum
#
# md5sum, like cmp, is the most expensive step, so ideally it should be
# the last filter run. The inode check, while fast, is interesting as
# the first filter only in certain cases. In some cases you may decide
# to skip it altogether, but realize that the program might then give
# you duplicate files that are really the SAME file (hard links).
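#
# For example, after
#   ln a b
# the names a and b share one inode; without the inode filter the
# pipeline would report them as a "duplicate" pair even though they
# are the same file and no space could be reclaimed.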
#
# Output is a list of md5sum/filename tuples ordered by matching md5sum,
# so the md5sum becomes the key for the next step of processing: deciding
# which files really are duplicates. In that next step, your processing
# loop would run cmp across the potentially duplicate files sharing an
# md5sum (this avoids the rare case of an md5sum collision, made even
# rarer by the size filter in particular).
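#
# A rough sketch of that next step (assumes, hypothetically, that this
# script's output was saved to dupes.txt; it compares each file in a
# group against the group's first file, and filenames containing
# newlines will still confuse it, since md5sum output is line-oriented):
#
#   cut -c1-32 dupes.txt | uniq | while read -r sum; do
#     grep "^$sum" dupes.txt | sed 's/^[0-9a-f]\{32\}  //' | {
#       read -r first
#       while read -r other; do
#         cmp -s "$first" "$other" && printf '%s == %s\n' "$first" "$other"
#       done
#     }
#   done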
#
# You could also consider a file type filter if you are interested only
# in duplicates of some particular file type, e.g. keeping just the
# files that file(1) reports as image data:
# xargs -0 file | grep 'image data' | cut -f1 -d: | tr '\012' '\000' |
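#
# A hypothetical splice point for that stage is between the size filter
# and the inode filter (note that cut -f1 -d: would truncate any
# filename containing a colon):
#
#   ... sed 's/^ *[0-9]* *//' | tr '\012' '\000' |
#   xargs -0 file | grep 'image data' | cut -f1 -d: | tr '\012' '\000' |
#   xargs -0 ls -id ...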
#
# Changelog
#
# 2010-07-18 - wwalker - changed sed | cut to more complex sed to prevent
# possible filename truncation
# 2010-07-18 - wwalker - changed 6s to 7s
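#
# Usage (assuming the script is saved as dedupe.sh):
#   ./dedupe.sh dir [dir ...] > dupes.txt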
find "$@" -type f -print0 |
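# Filter 1: size. ls -s prints the allocated size first; sort it
# numerically and keep every line whose first 7 characters (the size
# field) match another line's.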
xargs -0 ls -sd | sort -k1bn | uniq -w 7 -D |
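# Strip the size column and rebuild a NUL-separated list for the next xargs.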
sed 's/^ *[0-9]* *//' | tr '\012' '\000' |
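# Filter 2: inode. Keep one path per inode so hard links are not
# reported as duplicates of themselves.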
xargs -0 ls -id | sort | uniq -w 7 |
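# Strip the inode column and re-NUL the list.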
sed 's/^ *[0-9]* *//' | tr '\012' '\000' |
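# Filter 3: md5sum. Print every entry whose 32-character digest matches
# another entry's.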
xargs -0 md5sum | sort | uniq -w 32 -D