Skip to content

Instantly share code, notes, and snippets.

@YaYaB
Last active October 22, 2019 15:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save YaYaB/675ee59c76daedab966f134a207bf581 to your computer and use it in GitHub Desktop.
Save YaYaB/675ee59c76daedab966f134a207bf581 to your computer and use it in GitHub Desktop.
Detect identical files
import hashlib
from glob import glob
from pathlib import Path
import argparse
import os
def get_args(argv=None):
    """Parse the command-line options of the identical-file detector.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line.

    Returns:
        argparse.Namespace with two attributes: ``path`` (str, the folder
        to analyze) and ``recursive`` (bool, ``True`` when subfolders must
        be scanned as well).

    Raises:
        SystemExit: raised by argparse when ``--path`` is missing or an
            unknown option is supplied.
    """
    # The first positional parameter of ArgumentParser is ``prog``; the
    # summary has to be passed by keyword to be shown as the description.
    parser = argparse.ArgumentParser(description="Detect identical files.")
    # The folder is mandatory: without it the scan has nothing to walk.
    parser.add_argument(
        "--path",
        type=str,
        required=True,
        help="absolute path to folder to analyze",
    )
    parser.add_argument(
        "--recursive",
        action="store_true",
        default=False,
        help="Detect identical files recursively in subfolders",
    )
    return parser.parse_args(argv)
def md5Checksum(filePath):
    """Return the hexadecimal MD5 digest of the file at *filePath*.

    The file is opened in binary mode and fed to the hash in 8192-byte
    chunks, so its content is never held in memory in one piece.
    """
    with open(filePath, 'rb') as stream:
        digest = hashlib.md5()
        # Two-argument iter() keeps calling read() until it returns the
        # empty bytes sentinel, i.e. until end of file.
        for chunk in iter(lambda: stream.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()
if __name__ == "__main__":
    # Read the command-line options: folder to scan and recursion flag.
    cli = get_args()
    root = cli.path

    # '**/*' makes Path.glob descend into subfolders; '*' stays at top level.
    pattern = '**/*' if cli.recursive else '*'

    # Maps an MD5 hex digest to the resolved paths of the files having it.
    digest_to_paths = {}
    for entry in Path(root).glob(pattern):
        # Only regular files are hashed; directories and the like are skipped.
        if os.path.isfile(entry):
            digest = md5Checksum(entry)
            digest_to_paths.setdefault(digest, []).append(os.path.realpath(entry))

    # Report every digest shared by more than one file.
    for digest, paths in digest_to_paths.items():
        if len(paths) > 1:
            print("files with hash ", digest, " are:", paths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment