Created
January 11, 2017 07:00
-
-
Save samba/4b593ba8c7f3d9991d50aa1ef70177a0 to your computer and use it in GitHub Desktop.
MD5 Index Scanner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Scan an index of files, typically keyed by MD5 checksum, in the format produced by
GNU `md5sum` (not BSD-style), searching for one of many checksums listed in a pattern file.

The primary objective is to help find duplicates in a large collection of files, e.g. an
archive of music. Example:

    > find ./music -type f -print0 | xargs -0 md5sum | sort > index.txt
    > cut -c 1-33 index.txt | uniq -c | awk '{ if($1 > 1){ print $2 } }' > dupe_checksums.txt
    > python scandupes.py dupe_checksums.txt index.txt | xargs -d "\n" -I{} mv -vn "{}" ./dupes/

File Examples:

    <index file>
    009d2ba87423037d2b428953530720ac  ./file1
    019f8ae032f0db625f6de28226ca0af6  ./file2

    <pattern file>
    009d2ba87423037d2b428953530720ac
    109d2ba87423037d2b428953530720ac
    209d2ba87423037d2b428953530720ac
""" | |
import fileinput | |
import sys | |
import os | |
def notify(text, *args, **kwargs):
    """Write a formatted progress message to standard error.

    ``text`` is a ``str.format`` template; ``args`` and ``kwargs`` are passed
    straight through to ``text.format``. A trailing newline is appended.
    Messages go to stderr so that stdout carries only the matching filenames.
    """
    # sys.stderr.write works on both Python 2 and 3, unlike the
    # ``print >>sys.stderr`` statement form.
    sys.stderr.write(text.format(*args, **kwargs) + '\n')
def scan_checksums(filename):
    """Yield ``(checksum, filename)`` pairs parsed from a checksum listing.

    Each non-blank line is expected to hold a checksum, optionally followed
    by a separator and a file name, as written by GNU ``md5sum``. The
    separator is a space followed by a one-character mode flag (a second
    space for text mode, ``*`` for binary mode). A single space is also
    accepted.

    The yielded filename is ``None`` when the line carries only a checksum,
    as in a pattern file. Blank lines are skipped. Leading and trailing
    whitespace on a line is discarded.
    """
    stream = fileinput.FileInput(filename)
    try:
        for line in stream:
            entry = line.strip()
            if not entry:
                # A blank line holds neither a checksum nor a filename.
                continue
            # Split only at the first space so that filenames which
            # themselves contain spaces stay intact.
            checksum, _, name = entry.partition(' ')
            # Drop the md5sum mode flag so it is not mistaken for part of
            # the filename.
            if name[:1] in (' ', '*'):
                name = name[1:]
            yield (checksum, name or None)
    finally:
        # Close the input even when the consumer abandons the generator.
        stream.close()
def find_matching(pattern_file, scan_file):
    """Yield filenames from ``scan_file`` whose checksum is in ``pattern_file``.

    The checksums in ``pattern_file`` are loaded into a set first, then
    ``scan_file`` is streamed entry by entry. Progress messages are sent
    through ``notify``. Because this is a generator, nothing is read or
    reported until iteration starts.

    Index entries that have no filename are skipped, since there is nothing
    meaningful to report for them.
    """
    notify('Loading pattern file {0}', pattern_file)
    # Only the checksum column of the pattern file matters.
    search = set(checksum for checksum, _ in scan_checksums(pattern_file))
    notify('Scanning index file {0} with {1:d} patterns',
           scan_file, len(search))
    for checksum, filename in scan_checksums(scan_file):
        # A checksum-only index line has filename None; yielding it would
        # make callers that treat results as paths fail.
        if filename is not None and checksum in search:
            yield filename
def main(args):
    """Yield the matching files that still exist on disk.

    ``args`` is the command-line argument list without the program name:
    ``args[0]`` is the pattern file and ``args[1]`` is the index file.

    Raises ``SystemExit`` with a usage message when fewer than two arguments
    are supplied. Because this is a generator, that happens when iteration
    starts rather than when ``main`` is called.
    """
    if len(args) < 2:
        raise SystemExit(
            'usage: {0} <pattern file> <index file>'.format(sys.argv[0]))
    pattern_file, scan_file = args[0], args[1]
    for filename in find_matching(pattern_file, scan_file):
        # Skip missing names and paths that are not regular files (for
        # example, files already moved away since the index was built).
        if filename and os.path.isfile(filename):
            yield filename
if __name__ == '__main__':
    # Emit one matching filename per line on stdout. sys.stdout.write is
    # used instead of the print statement so the script runs on both
    # Python 2 and Python 3.
    for line in main(sys.argv[1:]):
        sys.stdout.write(line + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment