Skip to content

Instantly share code, notes, and snippets.

@abaines
Created November 6, 2020 23:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abaines/d9f3ff96a11f432440542d66d0144ab6 to your computer and use it in GitHub Desktop.
Save abaines/d9f3ff96a11f432440542d66d0144ab6 to your computer and use it in GitHub Desktop.
Search for duplicate files
import os
import sys
import hashlib
# Print the current working directory at startup.
print('cwd',os.getcwd())
def hashFile(f):
    """Return the SHA-224 hex digest of the file at path ``f``.

    The file is opened in binary mode and read in fixed-size chunks, so a
    large file is hashed without loading it into memory all at once, and the
    handle is closed as soon as hashing finishes.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.sha224()
    with open(f, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()
def searchFolder(folder):
    """Print groups of byte-identical ``.jar`` files found directly in ``folder``.

    Only regular files immediately inside ``folder`` are considered;
    subdirectories are not descended into. The absolute path of every regular
    file is printed as it is visited. Afterwards, for each content hash shared
    by more than one ``.jar`` file, the hash is printed, followed by the
    matching paths and a blank line.

    Raises:
        OSError: if ``folder`` cannot be listed or a file cannot be read.
    """
    hashes = {}
    for fileName in os.listdir(folder):
        fullFileName = os.path.abspath(os.path.join(folder, fileName))
        if not os.path.isfile(fullFileName):
            continue
        print(fullFileName)
        # Match on the extension only: a substring test would also accept any
        # file whose parent directory merely contains ".jar" in its name.
        if not fullFileName.endswith(".jar"):
            continue
        hashes.setdefault(hashFile(fullFileName), []).append(fullFileName)

    for digest, paths in hashes.items():
        if len(paths) > 1:
            # Print the hash of this group, not whichever hash was computed last.
            print(digest)
            for duplicate in paths:
                print(duplicate)
            print()
def findDiffs(path):
    """Report ``.jar`` files under ``path`` that share a name but differ in content.

    Walks the tree rooted at ``path``, hashes every file whose path ends in
    ``.jar`` and groups the full paths first by bare file name and then by
    content hash. For each file name that maps to more than one distinct
    hash, prints the name and the number of distinct hashes, then each hash
    with the number of files carrying it, then those files' paths.

    Raises:
        OSError: if a ``.jar`` file cannot be read.
    """
    dmap = {}
    for root, _directories, filenames in os.walk(path):
        for filename in filenames:
            fullName = os.path.join(root, filename)
            if not fullName.endswith(".jar"):
                continue
            digest = hashFile(fullName)
            # file name -> {content hash -> [full paths]}
            dmap.setdefault(filename, {}).setdefault(digest, []).append(fullName)

    for name, byHash in dmap.items():
        variants = len(byHash)
        # Every recorded name has at least one hash, so anything other than
        # exactly one means the same name exists with differing content.
        if variants != 1:
            print(name + " " + str(variants))
            for digest, paths in byHash.items():
                print(" " + digest + " " + str(len(paths)))
                for location in paths:
                    print(" " + location)
def findDups(path):
    """Find byte-identical files under ``path`` and rename one file per group.

    Every file in the tree rooted at ``path`` is hashed with ``hashFile`` and
    grouped by digest. For each digest shared by more than one file, the group
    size and its list of paths are printed, then the first path of the group
    is printed and that file is renamed by appending ``"__"`` to its path.
    The remaining files of the group are left untouched.

    NOTE: this modifies the file system (``os.rename``).
    """
    groups = {}
    for root, _directories, filenames in os.walk(path):
        for filename in filenames:
            fullName = os.path.join(root, filename)
            groups.setdefault(hashFile(fullName), []).append(fullName)

    for matches in groups.values():
        count = len(matches)
        if count <= 1:
            continue
        print(count, matches)
        first = matches[0]
        print(first)
        os.rename(first, first + "__")
if __name__ == "__main__":
    # findDups renames files, so only run it when executed as a script,
    # never as a side effect of importing this module.
    # The directory to scan may be given as the first command-line argument;
    # without one, fall back to the original hard-coded location.
    findDups(sys.argv[1] if len(sys.argv) > 1 else "E:\\Mosaic Images")
    print("Complete")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment