Skip to content

Instantly share code, notes, and snippets.

@ratozumbi
Created May 28, 2020 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ratozumbi/dc1fe7dcdacd41a5329c4dc1fdf32961 to your computer and use it in GitHub Desktop.
Save ratozumbi/dc1fe7dcdacd41a5329c4dc1fdf32961 to your computer and use it in GitHub Desktop.
Python script that hashes files recursively, then compares the hashes to find duplicate files.
import hashlib
import os
def getHashes(path, fileHashes):
    """Recursively walk *path* and record an MD5 fingerprint for every file.

    For each regular file found, writes one line of the form
    "<md5 hexdigest>##<full path>\n" to *fileHashes*.

    Parameters:
        path       -- directory to scan (recursed into depth-first)
        fileHashes -- writable text-file object receiving the hash lines
    """
    print("executing in " + path)
    for filename in os.listdir(path):
        fullpath = os.path.join(path, filename)
        if os.path.isfile(fullpath):
            hasher = hashlib.md5()
            with open(fullpath, 'rb') as afile:
                # Hash in fixed-size chunks so arbitrarily large files never
                # have to fit in memory (the original slurped the whole file
                # and left this chunked version commented out).
                BLOCKSIZE = 65536
                buf = afile.read(BLOCKSIZE)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = afile.read(BLOCKSIZE)
            fileHashes.write(hasher.hexdigest() + "##" + fullpath + "\n")
        elif os.path.isdir(fullpath):
            # Recurse only into real directories. The original recursed into
            # anything that was not a file, so broken symlinks or special
            # files crashed the scan with NotADirectoryError.
            getHashes(fullpath, fileHashes)
# Pass 1: hash every file under the current directory into hashes.txt.
with open("./hashes.txt", "w+") as fileHashes:
    getHashes('.', fileHashes)

# Pass 2: re-read hashes.txt and report every path whose MD5 digest was
# already seen (i.e. a duplicate of an earlier file).
dicHashPath = {}  # md5 hexdigest -> first path seen with that digest
                  # (path keeps its trailing "\n" from the file line)
with open("./hashes.txt", "r") as fileHashes:
    with open("./result.txt", "w+") as searchResult:
        # Iterate the file lazily instead of readlines() — no need to hold
        # every line in memory at once.
        for line in fileHashes:
            # maxsplit=1: split only on the FIRST "##" so a path that itself
            # contains "##" is not truncated (the original split on every
            # occurrence and silently dropped the rest of the path).
            lineHashPath = line.split("##", 1)
            if lineHashPath[0] in dicHashPath:
                print("===================\nCopia encontrada em " + lineHashPath[1] + "Original: " + dicHashPath[lineHashPath[0]] + "MD5: " + lineHashPath[0])
                searchResult.write("===================\nCopia encontrada em " + lineHashPath[1] + "Original: " + dicHashPath[lineHashPath[0]] + "MD5: " + lineHashPath[0] + "\n")
            else:
                dicHashPath[lineHashPath[0]] = lineHashPath[1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment