Skip to content

Instantly share code, notes, and snippets.

@zhuth
Created June 10, 2015 13:08
Show Gist options
  • Save zhuth/e61582b6d4b82d0c8b65 to your computer and use it in GitHub Desktop.
Save zhuth/e61582b6d4b82d0c8b65 to your computer and use it in GitHub Desktop.
list duplicated files
#!/usr/bin/python
#coding: utf-8
import os, sys
import hashlib
# Two-step duplicate-file finder.
#
#   scan:    script.py DIRECTORY [VISITED_LIST] > listing
#            Walk DIRECTORY and print one "SIZE MD5 PATH" record per file.
#            Paths named in VISITED_LIST are skipped, so an interrupted scan
#            can be resumed.  Progress and errors go to stderr.
#
#   report:  script.py LISTING
#            Read a listing produced by the scan step and print every group
#            of paths that share the same SIZE and MD5, one path per line,
#            with a blank line after each group.
#
# The MD5 only covers the first HASH_PREFIX_BYTES of each file, so a reported
# group means "same size and same leading bytes", not a full-content match.

HASH_PREFIX_BYTES = 1024 * 1024
HEX_DIGITS = '0123456789abcdef'
ERROR_FIELD = 'err'


def parse_record(line):
    """Split a scan record into ``(size, digest, path)``.

    A record is ``SIZE MD5 PATH`` for a hashed file or ``err err PATH`` for a
    file that could not be read.  PATH may itself contain spaces.  Returns
    ``None`` when *line* does not have that shape.
    """
    fields = line.rstrip('\r\n').split(' ', 2)
    if len(fields) != 3 or not fields[2]:
        return None
    size, digest, path = fields
    if size == ERROR_FIELD and digest == ERROR_FIELD:
        return size, digest, path
    looks_like_md5 = len(digest) == 32 and all(c in HEX_DIGITS for c in digest)
    if size.isdigit() and looks_like_md5:
        return size, digest, path
    return None


def load_visited(listing_path):
    """Return the set of paths named in the file *listing_path*.

    Each line contributes one path: the text after the last tab if the line
    contains a tab, the PATH field if the line is a scan record, otherwise
    the whole line.  Line endings are stripped so that the entries compare
    equal to the paths built while walking the tree.
    """
    visited = set()
    with open(listing_path, 'r') as listing:
        for line in listing:
            entry = line.rstrip('\r\n')
            if '\t' in entry:
                entry = entry[entry.rfind('\t') + 1:]
            else:
                record = parse_record(entry)
                if record is not None:
                    entry = record[2]
            if entry:
                visited.add(entry)
    return visited


def find_duplicates(lines):
    """Group the paths in *lines* by (size, digest).

    *lines* is any iterable of listing lines; it does not have to be sorted.
    Records for unreadable files (``err err PATH``) are ignored, since they
    carry no information about the file content.  Lines that are not scan
    records fall back to the legacy rule "key is everything before the first
    '/', path is the rest"; lines without any '/' are skipped.

    Returns a list of groups (lists of paths, two or more each) in order of
    first appearance.
    """
    groups = {}
    order = []
    for line in lines:
        record = parse_record(line)
        if record is not None:
            size, digest, path = record
            if size == ERROR_FIELD:
                continue
            key = (size, digest)
        else:
            text = line.rstrip('\r\n')
            slash = text.find('/')
            if slash < 0:
                continue
            key, path = text[:slash], text[slash:]
        if key not in groups:
            groups[key] = []
            order.append(key)
        groups[key].append(path)
    return [groups[key] for key in order if len(groups[key]) > 1]


def fingerprint(file_path):
    """Return ``(size_in_bytes, md5_hex_of_leading_bytes)`` for *file_path*.

    Raises EnvironmentError (IOError/OSError) if the file cannot be examined.
    """
    size = os.path.getsize(file_path)
    with open(file_path, 'rb') as handle:
        digest = hashlib.md5(handle.read(HASH_PREFIX_BYTES)).hexdigest()
    return size, digest


def scan_tree(top, visited, log=None):
    """Yield one record line (without newline) per file below *top*.

    Files whose path is in *visited* are skipped.  Each examined path is
    written to *log* (default: ``sys.stderr``) before it is read; a file that
    cannot be read yields ``err err PATH`` and the reason is logged.
    """
    if log is None:
        log = sys.stderr
    for root, dirs, files in os.walk(top):
        for name in files:
            file_path = os.path.join(root, name)
            if file_path in visited:
                continue
            log.write('%s\n' % file_path)
            # Compute both values before emitting anything, so a failure
            # never leaves a half-written record behind.
            try:
                size, digest = fingerprint(file_path)
            except EnvironmentError as error:
                # EnvironmentError covers both IOError (raised by open) and
                # OSError (raised by getsize).
                log.write('^ err: %s\n' % error)
                yield '%s %s %s' % (ERROR_FIELD, ERROR_FIELD, file_path)
            else:
                yield '%s %s %s' % (size, digest, file_path)


def main(argv=None):
    """Run the command line described at the top of this file.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else 'script'
        sys.stderr.write('usage: %s PATH [VISITED_LIST]\n' % prog)
        return 2

    target = argv[1]
    if os.path.isfile(target):
        # Report mode: the argument is a listing from an earlier scan.
        with open(target, 'r') as listing:
            groups = find_duplicates(listing)
        for group in groups:
            for path in group:
                sys.stdout.write('%s\n' % path)
            sys.stdout.write('\n')
        return 0

    # Scan mode: the argument is the directory to walk.
    visited = load_visited(argv[2]) if len(argv) > 2 else set()
    for record in scan_tree(target, visited):
        sys.stdout.write('%s\n' % record)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment