Skip to content

Instantly share code, notes, and snippets.

@shimarin
Last active February 21, 2021 05:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shimarin/7d7c4b77761f6f23a97bb0b49ead0806 to your computer and use it in GitHub Desktop.
Save shimarin/7d7c4b77761f6f23a97bb0b49ead0806 to your computer and use it in GitHub Desktop.
重複ファイル削除ツール
#!/usr/bin/python3
import os,argparse,glob
import xxhash
def calc_hash_from_file_handle(f):
    """Return the xxh64 hex digest of the data read from *f*.

    The handle is read in fixed-size chunks until ``read`` returns an empty
    result, so the content is never loaded in one piece.
    """
    hasher = xxhash.xxh64()
    # Read 0x800 hasher blocks per call.
    chunk_size = 0x800 * hasher.block_size
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            break
        hasher.update(chunk)
    return hasher.hexdigest()
def calc_hash(filename):
    """Open *filename* in binary mode and return its content hash.

    The digest is whatever ``calc_hash_from_file_handle`` returns for the
    opened handle.
    """
    with open(filename, "rb") as handle:
        digest = calc_hash_from_file_handle(handle)
    return digest
def main(path, pattern):
    """Delete duplicate files under *path* whose names match *pattern*.

    Files are grouped by size first; only groups with at least two members
    are hashed with ``calc_hash``. When two files of the same size have the
    same hash, the one with the shorter basename is removed (on a tie the
    file seen earlier is kept) and a message is printed for each removal.

    Args:
        path: Directory that is searched recursively.
        pattern: Glob pattern for file names, matched below ``path/**``.

    Returns:
        The number of distinct regular, non-symlink files that were scanned.
    """
    paths_by_size = {}
    seen_paths = set()
    for p in glob.iglob(os.path.join(path, "**", pattern), recursive=True):
        # glob may yield the same path more than once (for example when the
        # pattern itself contains "**"). A repeated path would be treated as
        # a duplicate of itself and the only copy would be deleted, so each
        # path is processed at most once.
        key = os.path.normpath(p)
        if key in seen_paths:
            continue
        if not os.path.isfile(p) or os.path.islink(p):
            continue
        seen_paths.add(key)
        paths_by_size.setdefault(os.path.getsize(p), []).append(p)

    for candidates in paths_by_size.values():
        if len(candidates) < 2:
            continue
        # Digests are only compared within one size group, so files of
        # different sizes are never considered duplicates of each other.
        kept_by_digest = {}
        for p in candidates:
            digest = calc_hash(p)
            existing = kept_by_digest.get(digest)
            if existing is None:
                kept_by_digest[digest] = p
                continue
            print("%s already exists as %s with hash %s." % (p, existing, digest))
            if len(os.path.basename(existing)) >= len(os.path.basename(p)):
                print("Removing %s" % p, flush=True)
                os.remove(p)
            else:
                print("Removing %s" % existing, flush=True)
                os.remove(existing)
                # The surviving file becomes the reference for this digest.
                kept_by_digest[digest] = p
    return len(seen_paths)
if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the directory to
    # search and the file-name glob pattern, passed to main() in that order.
    cli = argparse.ArgumentParser()
    for arg_name in ("path", "pattern"):
        cli.add_argument(arg_name)
    options = cli.parse_args()
    print("%d files scanned." % main(options.path, options.pattern))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment