Skip to content

Instantly share code, notes, and snippets.

@samba
Created January 11, 2017 07:00
Show Gist options
  • Save samba/4b593ba8c7f3d9991d50aa1ef70177a0 to your computer and use it in GitHub Desktop.
Save samba/4b593ba8c7f3d9991d50aa1ef70177a0 to your computer and use it in GitHub Desktop.
MD5 Index Scanner
#!/usr/bin/env python
"""
Scan an index of files, typically keyed by MD5 checksum, in the format produced by
GNU `md5sum` (not BSD-style), searching for one of many checksums listed in a pattern file.
The primary objective is to help find duplicates in a large collection of files, e.g. an
archive of music. Example:
> find ./music -type f -print0 | xargs -0 md5sum | sort > index.txt
> cut -c 1-33 index.txt | uniq -c | awk '{ if($1 > 1){ print $2 } }' > dupe_checksums.txt
> python scandupes.py dupe_checksums.txt index.txt | xargs -d "\n" -I {} mv -vn "{}" ./dupes/
File Examples:
<index file>
009d2ba87423037d2b428953530720ac ./file1
019f8ae032f0db625f6de28226ca0af6 ./file2
<pattern file>
009d2ba87423037d2b428953530720ac
109d2ba87423037d2b428953530720ac
209d2ba87423037d2b428953530720ac
"""
import fileinput
import sys
import os
def notify(text, *args, **kwargs):
    """Write a formatted status line to standard error.

    ``text`` is a ``str.format`` template; ``args`` and ``kwargs`` are
    substituted into it and a trailing newline is appended.
    """
    # The ``print >>stream`` statement form only exists in Python 2; under
    # Python 3 it parses as an expression and fails with TypeError when called.
    # Writing to the stream directly behaves the same on both versions.
    sys.stderr.write(text.format(*args, **kwargs) + '\n')
def scan_checksums(filename):
    """Yield ``(checksum, path)`` pairs parsed from an md5sum-style file.

    ``filename`` is handed to ``fileinput.FileInput`` unchanged. Each
    non-blank line is split at its first space: the text before it is the
    checksum and everything after it is the path, so paths containing spaces
    survive intact. A single extra separator character directly after that
    space (the second space or the ``*`` binary marker written by GNU
    ``md5sum``) is dropped. ``path`` is ``None`` when the line holds only a
    checksum, as in a pattern file. Blank lines are skipped.
    """
    stream = fileinput.FileInput(filename)
    try:
        for line in stream:
            # Keep trailing spaces that may belong to a filename; only the
            # line terminator and leading indentation are removed.
            record = line.lstrip().rstrip('\r\n')
            if not record:
                continue
            checksum, _, remainder = record.partition(' ')
            if remainder[:1] in (' ', '*'):
                remainder = remainder[1:]
            # A remainder made only of whitespace carries no usable path.
            yield (checksum, remainder if remainder.strip() else None)
    finally:
        # Release the underlying file handle even if the consumer stops early.
        stream.close()
def find_matching(pattern_file, scan_file):
    """Yield the filename of every index entry whose checksum is wanted.

    The checksums listed in ``pattern_file`` are loaded into a set, then
    ``scan_file`` is read entry by entry and the filename of each entry whose
    checksum is in that set is yielded. The yielded value is whatever
    ``scan_checksums`` produced for the entry, so it can be ``None`` when the
    index line carried no filename. Progress messages go through ``notify``.
    """
    notify('Loading pattern file {0}', pattern_file)
    wanted = set(entry[0] for entry in scan_checksums(pattern_file))

    notify('Scanning index file {0} with {1:d} patterns', scan_file, len(wanted))
    for digest, path in scan_checksums(scan_file):
        if digest not in wanted:
            continue
        yield path
def main(args):
    """Yield each matching index path that exists as a regular file.

    ``args`` is the command-line argument list without the program name:
    ``args[0]`` is the pattern file and ``args[1]`` is the index file.
    Entries reported by ``find_matching`` are yielded only when
    ``os.path.isfile`` confirms the path.
    """
    pattern_file = args[0]
    scan_file = args[1]
    for filename in find_matching(pattern_file, scan_file):
        # An index line with a checksum but no path comes through without a
        # usable filename; os.path.isfile(None) raises TypeError, so such
        # entries are skipped before touching the filesystem.
        if filename and os.path.isfile(filename):
            yield filename
if __name__ == '__main__':
    # Give a usage hint instead of an IndexError traceback when the two
    # required file arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('usage: {0} <pattern file> <index file>'.format(sys.argv[0]))
    for line in main(sys.argv[1:]):
        # print(...) with a single argument gives the same output on Python 2
        # and Python 3, unlike the Python 2-only statement form.
        print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment