Skip to content

Instantly share code, notes, and snippets.

@lihas
Created March 22, 2022 07:15
Show Gist options
  • Save lihas/3df58ca4a3379ebf0b7eda331014e494 to your computer and use it in GitHub Desktop.
Save lihas/3df58ca4a3379ebf0b7eda331014e494 to your computer and use it in GitHub Desktop.
# This is a Python script.
# Same as the previous version, except that hashing is done by thread workers.
# For one particular run the previous script took 21 minutes, while this one took 14 minutes.
# This Python script compares the files in two directories
# by their MD5 hash. It lists the files which have the same MD5
# and the ones which don't. I use this to verify my file backups.
# The directory structures are different, hence a direct folder
# comparison with tools like Beyond Compare (compare on contents) won't work.
import glob
import os
import hashlib
from datetime import datetime
import concurrent.futures
# Root directories whose contents are compared; each one is handed to
# ProcessPath() by the driver code at the bottom of the script.
dir1Path = "D:/DCIM/100CANON"
dir2Path = "E:/LighroomLibrary"
# Press the green button in the gutter to run the script.
def GetFileHash(filePath, chunkSize=1024 * 1024):
    """Compute the MD5 digest of a single file.

    Args:
        filePath: Path of the directory entry to hash.
        chunkSize: Number of bytes read per iteration. Reading in chunks
            keeps memory use flat even for very large files.

    Returns:
        A dict with the keys "hash", "path" and "isFile".
        "isFile" is True only when filePath is a regular file whose
        contents were hashed successfully; in that case "hash" holds the
        hex digest and "path" holds filePath. For anything else (a
        directory, a missing path, an unreadable file) "hash" is None
        and "isFile" is False, so callers can skip the entry.
    """
    if not os.path.isfile(filePath):
        return {"hash": None, "path": None, "isFile": False}

    digest = hashlib.md5()
    try:
        # The context manager closes the handle even if a read fails.
        with open(filePath, 'rb') as fhandle:
            for chunk in iter(lambda: fhandle.read(chunkSize), b""):
                digest.update(chunk)
    except OSError as err:
        # Report the problem instead of silently returning None, which
        # would make the consumer fail on res["isFile"].
        print("could not read", filePath, err)
        return {"hash": None, "path": filePath, "isFile": False}

    return {"hash": digest.hexdigest(), "path": filePath, "isFile": True}
def ProcessPath(dirPath, dirFutures):
    """Queue an MD5 hashing job for every entry found below dirPath.

    Args:
        dirPath: Root directory to scan recursively.
        dirFutures: List that receives one Future per entry found. Each
            future resolves to the value returned by GetFileHash. The
            list is extended in place; nothing is returned.

    The function returns as soon as all jobs are queued; the hashing
    itself continues on worker threads.
    """
    # glob is run relative to dirPath so that the matches are relative
    # paths. NOTE: the working directory is process-wide, so it is always
    # restored, even when the scan fails.
    cwd = os.getcwd()
    os.chdir(dirPath)
    try:
        fileList = glob.glob("**", recursive=True)
    finally:
        os.chdir(cwd)

    # One pool for the whole directory bounds the number of threads (and
    # of files being read at once) instead of one executor per file.
    executor = concurrent.futures.ThreadPoolExecutor()
    try:
        for file in fileList:
            file_path = dirPath + '/' + file
            dirFutures.append(executor.submit(GetFileHash, file_path))
    finally:
        # wait=False: do not block here; already queued jobs still run
        # and the pool's threads are released once they are done.
        executor.shutdown(wait=False)
def ProcessFutures(futures):
    """Collect hashing results and group file paths by their digest.

    Args:
        futures: Iterable of Future objects whose results are dicts with
            the keys "hash", "path" and "isFile" (or None when the
            worker produced nothing).

    Returns:
        A dict mapping each digest to the list of paths that have it.
        Paths are appended in completion order, not submission order.

    Raises:
        Whatever exception a worker raised; fut.result() re-raises it.
    """
    fileHashMap = {}
    for fut in concurrent.futures.as_completed(futures):
        res = fut.result()
        # A worker may yield None or flag the entry as not a file.
        if not res or not res.get("isFile"):
            continue
        md5hash = res.get("hash")
        # Entries without a digest must not be grouped under the key
        # None; they would later be reported as identical files.
        if md5hash is None:
            continue
        fileHashMap.setdefault(md5hash, []).append(res.get("path"))
    return fileHashMap
# Driver: queue the hashing for both trees, collect the digests, then
# report which digests occur in both trees and which occur in only one.
startTime = datetime.now()
print("startTime", startTime)

# Both directories are queued before any result is collected.
dir1Futures = []
ProcessPath(dir1Path, dir1Futures)
print("path1 processed", datetime.now())

dir2Futures = []
ProcessPath(dir2Path, dir2Futures)
print("path2 processed", datetime.now())

print("processing dir 1 futures", datetime.now())
dir1hashMap = ProcessFutures(dir1Futures)
print("processing dir 2 futures", datetime.now())
dir2hashMap = ProcessFutures(dir2Futures)
print("done processing futures", datetime.now())

# Split the digests of directory 1 into "present in both" and "only here".
sameFiles = {}
dir1ExtraFiles = {}
for digest, dir1_paths in dir1hashMap.items():
    if digest not in dir2hashMap:
        dir1ExtraFiles[digest] = dir1_paths
        continue
    # A fresh list: paths from directory 1 followed by those from directory 2.
    sameFiles[digest] = dir1_paths + dir2hashMap[digest]

# Digests that exist only in directory 2.
dir2ExtraFiles = {
    digest: dir2_paths
    for digest, dir2_paths in dir2hashMap.items()
    if digest not in dir1hashMap
}

# Print the three result tables, one digest per line.
report_sections = (
    ("same files", sameFiles),
    ("Dir1 extra files", dir1ExtraFiles),
    ("Dir2 extra files", dir2ExtraFiles),
)
for heading, table in report_sections:
    print(heading)
    for digest, paths in table.items():
        print(digest, paths)

print("runTime", datetime.now() - startTime)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment