Duplicate mover/remover based on file hashes.
#!/usr/bin/python
from __future__ import print_function
import sys, hashlib, os, shutil, time
from collections import Counter

# Removes duplicates in a folder by generating and comparing file hash values.
# Has the option to remove files automatically or move them to a separate place.

# dir_to_check and dir_to_move are set from argv in __main__ below.
# dir_to_check: location where duplication may occur (absolute/paths/only/with/trailing/).
# dir_to_move: all duplicates will be moved here, ignoring their relative file structure,
# e.g. dir_to_move = "/media/xt1/BackThisUp/devices/s2/duplicates/"
# Function to get the list of files; recursive, so more depth == heavier.
def scour(rootdirpath, filetype=None, exclude=[], follow_links=False):
    # Scans and returns sorted lists [files, dirs, links], given a root directory path ending in /.
    # Pass a list of filetypes as a filter (e.g. ['.txt', '.zip'], or just '.txt' if a single element). MIND THE DOT.
    # Skips an item if its extension or name is in exclude (e.g. ['.txt', 'directoryname', 'symlinkname']).
    # follow_links=True should resolve symlinks whose target is not already in the list. DOES NOT FOLLOW LINKS (yet).
    files = []  # file list
    dirs = []   # dirs list
    links = []  # links list

    def rectree(filepath):  # recursive tree walk to scan files/folders
        filelist = os.listdir(filepath)
        for item in filelist:
            # skip if the extension, dirname or linkname is in the exclude list
            if (os.path.splitext(item)[1] not in exclude) and (str(item) not in exclude):
                # if file, and (if a filter is provided) its extension matches
                if os.path.isfile(os.path.join(filepath, item)):
                    if (filetype is None) or (os.path.splitext(item)[1] in filetype):
                        files.append(os.path.join(filepath, item))
                # if dir, record it and recurse into it
                if os.path.isdir(os.path.join(filepath, item)):
                    dirs.append(os.path.join(filepath, item))
                    rectree(os.path.join(filepath, item))
                # if symlink
                if os.path.islink(os.path.join(filepath, item)):
                    links.append(os.path.join(filepath, item))
                    # if follow_links:  # DOES NOT FOLLOW LINKS
                    #     realpath = os.path.realpath(os.path.join(filepath, item))  # resolve realpath
                    #     if rootdirpath not in realpath:  # avoid loops: only follow targets outside the root
                    #         filelist.append(realpath)

    # run the walk, then sort and serve
    if type(rootdirpath) == str:  # single path
        if os.path.isdir(rootdirpath):  # if the directory exists
            rectree(rootdirpath)
    elif type(rootdirpath) in (list, tuple):  # many paths in a list/tuple
        for item in rootdirpath:
            rectree(item)
    files.sort()  # turn off sorting if too heavy
    dirs.sort()
    links.sort()
    return files, dirs, links
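
# A sketch of how scour() is meant to be called (paths here are hypothetical);
# hashcheck() below only uses the first of the three returned lists:
# files, dirs, links = scour('/home/user/pictures/', filetype=['.jpg', '.png'],
#                            exclude=['__duplicates__'])
# print(len(files), "files,", len(dirs), "dirs,", len(links), "symlinks")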
# deprecated
# def titlecheck(rootdirpath, filetype=None, exclude=[]):
#     # Extract titles, convert to a Counter, check duplicates, and print the
#     # names that occur more than once along with their paths.
#     titles = []
#     duptitles = []
#     oplist = []
#     filepathlist = scour(rootdirpath, filetype=filetype, exclude=exclude)[0]
#     for path in filepathlist:
#         titles.append(os.path.basename(path))
#     titlecount = Counter(titles)
#     for path in filepathlist:
#         title = os.path.basename(path)
#         if titlecount[title] > 1:  # select only titles that have more than 1 occurrence
#             if title not in duptitles:
#                 print("Original:", title, path)
#                 duptitles.append(title)
#             else:
#                 print("Duplicate:", title, path)
#                 oplist.append(path)
#     duptitles.sort()
#     print("\n\nTotal Files:", len(filepathlist))
#     print("Total unique filenames:", len(titlecount))
#     print("Total filenames that have duplicates:", len(duptitles), "\n\n")
#     print(oplist)
#     return oplist  # list of files that have duplicate names

# deprecated
# def md5sum(filepath):  # hash func
#     md5 = hashlib.md5()
#     with open(filepath, 'rb') as f:
#         for chunk in iter(lambda: f.read(128 * md5.block_size), b''):
#             md5.update(chunk)
#     return md5.hexdigest()
def sha1sum(filepath):  # hash func
    sha1 = hashlib.sha1()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(128 * sha1.block_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()
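
# sha1sum() reads in chunks of 128 * block_size bytes, so even very large files
# are hashed with a flat memory footprint. Quick sanity check (hypothetical path):
# print(sha1sum('/tmp/empty.file'))
# a zero-byte file hashes to 'da39a3ee5e6b4b0d3255bfef95601890afd80709'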
def hashcheck(rootdirpath, filetype=None, exclude=[]):
    # Generate the file list, hash every file, detect duplicates by hash,
    # and return the duplicate files.
    i = 1
    hashes = []
    hashlist = []
    duphashes = []
    oplist = []
    filepathlist = scour(rootdirpath, filetype=filetype, exclude=exclude)[0]
    print("\nTotal Files:", len(filepathlist))
    for path in filepathlist:  # hashing
        print("Hashing done:", i * 100 // len(filepathlist), "%")
        title = os.path.basename(path)
        filehash = sha1sum(path)
        hashes.append(filehash)
        hashlist.append([filehash, title, path])
        i += 1
    print("\nFinished hashing...")
    hashcount = Counter(hashes)
    hashlist.sort()
    print("Checking for duplicates...")
    for fhash in hashcount:  # getting duplicates
        if hashcount[fhash] > 1:
            for item in hashlist:
                filehash, title, path = item
                if fhash == filehash and filehash not in duphashes:
                    print("\nOriginal:", title, "(" + filehash + ")")
                    print("Copies:", hashcount[filehash] - 1)
                    print(path)
                    duphashes.append(filehash)
                elif fhash == filehash and filehash in duphashes:
                    print("\nDuplicate:", title, "(" + filehash + ")", "\n", path)
                    oplist.append(path)
    print("\n\nStats:\nTotal Files:", len(filepathlist))
    print("Total unique files:", len(hashcount))
    print("Total files that have duplicates:", len(duphashes))
    print("Total files that are duplicates:", len(oplist), "\n\n")
    return oplist
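
# After hashlist.sort(), the first path seen for each hash is reported as the
# "original" and only the remaining copies are returned, so a call like this
# (hypothetical paths) leaves one copy of each picture untouched:
# dupes = hashcheck('/home/user/pictures/', filetype=['.jpg', '.jpeg', '.png'])
# e.g. -> ['/home/user/pictures/2015/IMG_001 (copy).jpg', ...]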
# moving files
# print(hashcheck("/media/xt1/BackThisUp/devices/s2/duplicates/", filetype=['.jpg', '.jpeg', '.png', '.tif', '.webm']))
#     shutil.move(item, "/media/xt1/BackThisUp/devices/s2/duplicates/" + os.path.basename(item))
def main(rootdirpath, filetype=None, exclude=[], move_files=True, move_dir='', remove_files=False):
    # get the files to move/remove from hashcheck
    filelist = hashcheck(rootdirpath, filetype=filetype, exclude=exclude)  # returns all duplicate files
    if not filelist:
        print("Yay! No duplicate items. :)")
        return
    # else start moving
    if move_files and os.path.isdir(move_dir):
        print("Moving duplicates to:", move_dir)
        for item in filelist:  # moving files
            print("Moving:", os.path.basename(item))
            # rebuild the file's path relative to the common prefix inside move_dir
            commonpath = os.path.commonprefix([item, move_dir])
            destdir = os.path.join(move_dir, item.replace(commonpath, '').replace(os.path.basename(item), ''))
            if not os.path.exists(destdir):
                try:
                    os.makedirs(destdir)
                except OSError:
                    if not os.path.isdir(destdir):
                        raise
            try:
                shutil.move(item, destdir)
            except shutil.Error:  # a file with that name already exists in destdir
                # append a timestamp instead; doesn't replace unless specified
                shutil.move(item, os.path.join(destdir, os.path.basename(item) + time.strftime("%Y%m%d%H%M%S")))
    elif remove_files:
        for item in filelist:  # USE WITH CAUTION
            print("Removing:", os.path.basename(item))
            os.remove(item)
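
# Collision note: when destdir already holds a file with the same name,
# shutil.move() raises shutil.Error and the duplicate is retried with a
# timestamp suffix, e.g. "IMG_001.jpg" -> "IMG_001.jpg20151028063000"
# (hypothetical name), so an existing file is never silently overwritten.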
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        dir_to_check = os.path.abspath(sys.argv[1])
        dir_to_move = os.path.abspath(sys.argv[2]) if len(sys.argv) >= 3 and os.path.exists(os.path.abspath(sys.argv[2])) else os.path.join(dir_to_check, "__duplicates__")
        # create the duplicates dir if it does not exist
        if not os.path.exists(dir_to_move):
            try:
                os.makedirs(dir_to_move)
            except OSError:
                if not os.path.isdir(dir_to_move):
                    raise
        print("Checking --> ", dir_to_check)
        print("Duplicates --> ", dir_to_move)
        main(dir_to_check, move_files=True, move_dir=dir_to_move, exclude=['__duplicates__'])
    # show usage
    else:
        print("Usage:\npython", sys.argv[0], "/path/to/check", "/path/where/duplicates/go")

# example of calling main() directly:
# dir_to_check = '/home/arch/scrap/pix/'
# dir_to_move = '/home/arch/scrap/pix/duplicates/'
# main(dir_to_check, move_files=True, move_dir=dir_to_move, exclude=['duplicates'])