Skip to content

Instantly share code, notes, and snippets.

@clementlefevre
Last active December 25, 2017 15:37
Show Gist options
  • Save clementlefevre/e3fa2567c98f580ebb73f96401d91d2c to your computer and use it in GitHub Desktop.
Save clementlefevre/e3fa2567c98f580ebb73f96401d91d2c to your computer and use it in GitHub Desktop.
Remove duplicates in folder
#!/usr/bin/python
import sys
import os
import hashlib
import pandas as pd
import pickle
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive ``chunk_size``-byte chunks read from the open file *fobj*.

    Iteration stops as soon as a read comes back empty (end of file).
    """
    chunk = fobj.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fobj.read(chunk_size)
def create_hashes(paths):
    """Walk every directory in *paths* and fingerprint each file found.

    Returns a dict mapping ``(md5_digest, file_size)`` to the list of paths
    whose content produced that fingerprint, so duplicate files share a key.
    The size is part of the key as a cheap extra guard against md5 collisions.
    """
    hashes = {}
    for path in paths:
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                mhash = hashlib.md5()
                # Hash in chunks so large files never need to fit in memory.
                # The context manager fixes the original's file-handle leak
                # (the file object was opened inline and never closed).
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        mhash.update(chunk)
                file_id = (mhash.digest(), os.path.getsize(full_path))
                hashes.setdefault(file_id, []).append(full_path)
    return hashes
def check_for_duplicates(folder_to_keep, display=False):
    """Delete, from every sibling folder of the cwd, files whose content
    already exists in *folder_to_keep*, then prune any emptied folders.

    Parameters
    ----------
    folder_to_keep : str
        Name of the directory (under the cwd) holding the reference copies;
        nothing inside it is ever removed.
    display : bool
        When True, print each group of duplicates before deleting it.
    """
    folders_to_clean = [folder for folder in os.listdir('.')
                        if os.path.isdir(folder) and folder != folder_to_keep]
    print("folders to clean : ", folders_to_clean)

    hashes_to_keep = create_hashes([folder_to_keep])
    print('files in ', folder_to_keep, len(hashes_to_keep))
    hashes_to_clean = create_hashes(folders_to_clean)
    print('files in ', folders_to_clean, len(hashes_to_clean))

    # A file is a duplicate when its (md5, size) key also appears in the
    # keep-folder's table.
    for file_hash, files_dup in hashes_to_clean.items():
        if file_hash in hashes_to_keep:
            if display:
                print("Duplicate found: %s" % (files_dup))
            # The original used map(os.remove, files_dup); in Python 3 map
            # is lazy and would silently delete nothing — loop explicitly.
            for dup in files_dup:
                os.remove(dup)

    print('remaining files in folders to clean : {}'.format(
        len(create_hashes(folders_to_clean))))
    for folder in folders_to_clean:
        recursive_delete_if_empty(folder)
def recursive_delete_if_empty(path):
    """Recursively delete empty directories.

    Returns True when *path* was (or became) empty and has been removed;
    False when it is a regular file or still contains anything.
    """
    if not os.path.isdir(path):
        # Files are never deleted here; their presence blocks removal of
        # the parent directory. (Hook point for junk files like desktop.ini.)
        return False
    # Materialise the recursion results in a list on purpose: all() over a
    # generator would short-circuit and skip cleaning the later children.
    children_removed = [recursive_delete_if_empty(os.path.join(path, child))
                        for child in os.listdir(path)]
    if not all(children_removed):
        return False
    os.rmdir(path)  # nothing left inside (or it was empty to begin with)
    return True
def create_dupes_list(root_path=None):
    """Scan every sub-directory of *root_path* (default: cwd) and write two
    CSV reports about duplicate files.

    Side effects: pickles the hash table to ``hashes.p``, writes
    ``duplicates_list.csv`` (one row per duplicated file) and
    ``duplicates_list_summary.csv`` (duplicate counts per top-level folder pair).
    """
    # Default/None handling fixes the original TypeError: the __main__
    # guard calls create_dupes_list() with no argument.
    root_path = root_path or '.'
    # Join BEFORE the isdir test — the original tested the bare entry name
    # against the cwd, so a non-cwd root yielded no folders at all.
    folders_to_analyze = [os.path.join(root_path, folder)
                          for folder in os.listdir(root_path)
                          if os.path.isdir(os.path.join(root_path, folder))]
    hashes = create_hashes(folders_to_analyze)
    # Snapshot to disk so a long scan can be reloaded without re-hashing;
    # context managers close the handles the original left open.
    with open("hashes.p", "wb") as fobj:
        pickle.dump(hashes, fobj)
    with open("hashes.p", "rb") as fobj:
        hashes = pickle.load(fobj)

    # One row per (hash, path); 'count' is the number of paths per hash.
    df = pd.DataFrame.from_dict(hashes, orient='index')
    df_stacked = pd.DataFrame(df.stack(), columns=['path'])
    df_stacked = df_stacked.reset_index(level=0)
    df_stacked.rename(columns={'level_0': 'hash'}, inplace=True)
    df_stacked = df_stacked.sort_values('hash')
    df_stacked['count'] = df_stacked.groupby('hash')["path"].transform("count")
    df_stacked = df_stacked.sort_values('count', ascending=False)
    # Split the path into at most 5 folder-level columns (named 0..4);
    # keyword n= replaces the deprecated positional form.
    df_files_paths = df_stacked['path'].str.split('/', n=4, expand=True)
    df_stacked['file_name'] = df_stacked.path.str.extract('([^/]+$)')
    # axis=1 keyword replaces the deprecated positional axis argument.
    df_stacked.drop('path', axis=1, inplace=True)
    df_final = pd.concat([df_files_paths, df_stacked], axis=1)
    df_final = df_final[df_final['count'] > 1]
    df_final = df_final.sort_values('hash')
    df_final.to_csv('duplicates_list.csv')
    df_summary = (df_final.groupby([0, 1])[['hash']].count()
                  .sort_values('hash', ascending=False))
    df_summary.to_csv('duplicates_list_summary.csv')
if __name__ == '__main__':
    # sys.argv[1] names the folder whose copies should be kept; it is only
    # consumed by check_for_duplicates (currently disabled below).
    if len(sys.argv) > 1:
        folder_to_keep = sys.argv[1]
    else:
        # Explicit length check replaces the original bare `except:`,
        # which would also have swallowed unrelated errors.
        folder_to_keep = None
        print('no arg given !')
    # Pass the root explicitly: the original called create_dupes_list()
    # with no argument, which raises TypeError.
    create_dupes_list('.')
    # check_for_duplicates(folder_to_keep, True)
@clementlefevre
Copy link
Author

clementlefevre commented Oct 7, 2017

Copy the .py file into the folder of interest.
Pass the name of the folder you want to keep as the first argument.
The script deletes all duplicate files found in the other directories/subdirectories and finally removes all empty folders.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment