Skip to content

Instantly share code, notes, and snippets.

@lihas
Created February 6, 2022 16:57
Show Gist options
  • Save lihas/e7b102000afad3856c2c7efa68e0499d to your computer and use it in GitHub Desktop.
Save lihas/e7b102000afad3856c2c7efa68e0499d to your computer and use it in GitHub Desktop.
This Python script compares the files in two directories by their MD5 hash. It lists the files that have the same MD5 and the ones that don't. I use this to verify my file backups. The directory structures are different, hence a direct folder comparison with tools like Beyond Compare (compare on contents) won't work.
# This is a Python script.
# This Python script compares the files in two directories
# by their MD5 hash. It lists the files that have the same MD5
# and the ones that don't. I use this to verify my file backups.
# The directory structures are different, hence a direct folder
# comparison with tools like Beyond Compare (compare on contents) won't work.
import glob
import os
import hashlib
from datetime import datetime
# Hard-coded roots of the two directory trees whose files are compared by
# MD5 hash. Each one is passed to ProcessPath, which scans it recursively.
dir1Path = "D:/"  # first tree; reported as "Dir1" in the output
dir2Path = "E:/LighroomLibrary"  # second tree; reported as "Dir2" in the output
# Press the green button in the gutter to run the script.
def ProcessPath(dirPath):
    """Group the files found under dirPath by the MD5 digest of their content.

    The directory is scanned recursively with glob's "**" pattern. Per the
    glob documentation, that pattern does not match entries whose names
    start with a dot, so hidden files are not included.

    Args:
        dirPath: Root directory to scan.

    Returns:
        A dict mapping each MD5 hex digest to the list of file paths
        (relative to dirPath) whose content has that digest. Files with
        identical content therefore share one entry.

    Raises:
        OSError: If dirPath cannot be entered or a file cannot be read.
    """
    cwd = os.getcwd()
    # Work from inside dirPath so the glob results are relative paths.
    os.chdir(dirPath)
    fileHashMap = {}
    try:
        for file in glob.glob("**", recursive=True):
            if not os.path.isfile(file):
                continue  # directories are matched by "**" as well
            md5 = hashlib.md5()
            with open(file, 'rb') as fhandle:
                # Hash in 1 MiB chunks so large files are never held in
                # memory in full.
                for chunk in iter(lambda: fhandle.read(1 << 20), b""):
                    md5.update(chunk)
            fileHashMap.setdefault(md5.hexdigest(), []).append(file)
    finally:
        # Restore the caller's working directory even if hashing failed.
        os.chdir(cwd)
    return fileHashMap
# Hash both directory trees, reporting the elapsed time after each one.
startTime = datetime.now()
print("startTime", startTime)
dir1hashMap = ProcessPath(dir1Path)
print("path1 processed", datetime.now() - startTime)
dir2hashMap = ProcessPath(dir2Path)
print("path2 processed", datetime.now() - startTime)

# Digests present in both trees: list the dir1 paths first, then dir2's.
sameFiles = {
    digest: paths + dir2hashMap[digest]
    for digest, paths in dir1hashMap.items()
    if digest in dir2hashMap
}
# Digests that occur in only one of the two trees.
dir1ExtraFiles = {
    digest: paths
    for digest, paths in dir1hashMap.items()
    if digest not in dir2hashMap
}
dir2ExtraFiles = {
    digest: paths
    for digest, paths in dir2hashMap.items()
    if digest not in dir1hashMap
}

# Report each group as "<md5> <list of paths>" lines under its heading.
print("same files")
for digest, paths in sameFiles.items():
    print(digest, paths)
print("Dir1 extra files")
for digest, paths in dir1ExtraFiles.items():
    print(digest, paths)
print("Dir2 extra files")
for digest, paths in dir2ExtraFiles.items():
    print(digest, paths)
print("runTime", datetime.now() - startTime)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment