Last active
February 21, 2021 05:40
-
-
Save shimarin/7d7c4b77761f6f23a97bb0b49ead0806 to your computer and use it in GitHub Desktop.
重複ファイル削除ツール
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import os,argparse,glob | |
import xxhash | |
def calc_hash_from_file_handle(f):
    """Return the xxh64 hex digest of the data readable from *f*.

    The handle is read from its current position until ``read`` returns
    an empty result, one fixed-size chunk at a time, so the content is
    never loaded in a single piece.
    """
    hasher = xxhash.xxh64()
    # Chunk size is a multiple (0x800) of the hasher's own block size.
    chunk_size = hasher.block_size * 0x800
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            break
        hasher.update(chunk)
    return hasher.hexdigest()
def calc_hash(filename):
    """Open *filename* in binary mode and return the hex digest of its content."""
    with open(filename, "rb") as stream:
        digest = calc_hash_from_file_handle(stream)
    return digest
def main(path, pattern):
    """Delete duplicate files under *path* whose names match *pattern*.

    Files are collected with a recursive glob, grouped by size, and hashed
    only when at least two files share a size. A hash match is confirmed
    with a byte-for-byte comparison before anything is removed. Of two
    identical files the one with the longer base name is kept; on a tie
    the one encountered first is kept.

    Args:
        path: Directory to search.
        pattern: Glob pattern for file names, joined after ``path/**``.

    Returns:
        The number of distinct regular files that were scanned.

    Raises:
        OSError: If a file cannot be read or removed.
    """
    import filecmp  # local import: only this function needs it

    by_size = {}
    seen_real_paths = set()
    cnt = 0
    for p in glob.iglob(os.path.join(path, "**", pattern), recursive=True):
        if not os.path.isfile(p) or os.path.islink(p):
            continue
        # The recursive glob can yield the same path twice (e.g. pattern
        # "**") or reach one file through a symlinked directory. Such an
        # entry would match itself as a "duplicate" and the only copy would
        # be deleted, so every resolved path is accepted just once.
        real = os.path.realpath(p)
        if real in seen_real_paths:
            continue
        seen_real_paths.add(real)
        by_size.setdefault(os.path.getsize(p), []).append(p)
        cnt += 1

    for same_size in by_size.values():
        if len(same_size) < 2:
            continue
        # One table per size group: files of different sizes can never be
        # duplicates, even if their digests happen to collide.
        by_digest = {}
        for p in same_size:
            digest = calc_hash(p)
            existing = by_digest.get(digest)
            if existing is None:
                by_digest[digest] = p
                continue
            # A 64-bit digest match is not proof of equality; compare the
            # actual bytes before deleting anything.
            if not filecmp.cmp(existing, p, shallow=False):
                continue
            print("%s already exists as %s with hash %s." % (p, existing, digest))
            if len(os.path.basename(existing)) >= len(os.path.basename(p)):
                print("Removing %s" % p, flush=True)
                os.remove(p)
            else:
                print("Removing %s" % existing, flush=True)
                os.remove(existing)
                by_digest[digest] = p
    return cnt
if __name__ == "__main__":
    # Command line: two required positionals, the directory to search and
    # the glob pattern for file names.
    cli = argparse.ArgumentParser()
    for positional in ("path", "pattern"):
        cli.add_argument(positional)
    options = cli.parse_args()
    scanned = main(options.path, options.pattern)
    print("%d files scanned." % scanned)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment