Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Find duplicate files inside a directory with Python.
"""
Find file duplicates inside a directory through sha1 hashing
of file contents.
Candidates are discarded early by first comparing file sizes and,
optionally, by hashing only the first part of each file.
"""
from __future__ import print_function
import os
import os.path
from hashlib import sha1
from collections import defaultdict
def partial_file_hash(filepath, num_bytes, block_size=65536):
    """
    Return the SHA-1 hex digest of the first num_bytes bytes of a file.

    The file is opened in binary mode and read in chunks of at most
    block_size bytes, so no more than one chunk is held in memory at a time.
    If the file holds fewer than num_bytes bytes, the digest covers the
    whole file. If num_bytes is zero or negative nothing is read and the
    digest of empty input is returned.

    Raises ValueError if there are bytes to read and block_size is not
    positive (such a block size could never make progress).
    """
    if num_bytes > 0 and block_size <= 0:
        raise ValueError("block_size must be a positive integer")
    hasher = sha1()
    remaining = num_bytes
    with open(filepath, 'rb') as file_to_hash:
        while remaining > 0:
            file_chunk = file_to_hash.read(min(remaining, block_size))
            if not file_chunk:
                # End of file reached before num_bytes were consumed:
                # stop instead of issuing further empty reads.
                break
            hasher.update(file_chunk)
            # Count what was actually read, not what was requested.
            remaining -= len(file_chunk)
    return hasher.hexdigest()
def group_files_by_size(dir_to_scan):
    """
    Group the regular files directly inside dir_to_scan by their size.

    Returns a defaultdict(list) whose keys are file sizes in bytes and
    whose values are lists of (filepath, size) tuples, where filepath is
    dir_to_scan joined with the entry name.

    Entries that are not regular files (subdirectories, broken symlinks,
    and so on) are skipped: they cannot be opened and hashed as files, so
    they can never be content duplicates.
    """
    by_size = defaultdict(list)
    for fname in os.listdir(dir_to_scan):
        fpath = os.path.join(dir_to_scan, fname)
        if not os.path.isfile(fpath):
            # Directories would otherwise be grouped by their stat size
            # and fail later when opened for hashing.
            continue
        size = os.stat(fpath).st_size
        by_size[size].append((fpath, size))
    return by_size
def split_files_by_hash(file_and_size_list, block_size, partial_hash=False):
    """
    Partition (filepath, size) tuples into lists that share the same hash.

    block_size bounds how much file data is read into memory at once
    while hashing.

    With partial_hash=True only the first block_size bytes of each file
    (or the whole file, if it is smaller) are hashed, which allows a cheap
    early discard of big files. Otherwise the full size of each file is
    hashed.

    Returns a list of lists of (filepath, size) tuples, one inner list
    per distinct hash value.
    """
    groups = defaultdict(list)
    for path, length in file_and_size_list:
        # Cap the hashed prefix at one block only in partial mode.
        limit = min(length, block_size) if partial_hash else length
        digest = partial_file_hash(path, limit, block_size)
        groups[digest].append((path, length))
    return list(groups.values())
def split_groups_by_hash(file_and_size_groups, block_size, partial_hash=False):
    """
    Refine groups of (filepath, size) tuples by content hash.

    Each incoming group is split with split_files_by_hash, and the
    resulting sub-groups of all incoming groups are returned together as
    one flat list of lists.

    block_size and partial_hash are passed through unchanged; with
    partial_hash=True only the first block_size bytes of each file are
    hashed (useful for early discard of really big files).
    """
    return [
        subgroup
        for group in file_and_size_groups
        for subgroup in split_files_by_hash(group, block_size, partial_hash)
    ]
def find_duplicates(dir_to_scan, block_size=65536, use_partial_hash=False):
    """
    Find files with identical content among the direct children of
    dir_to_scan.

    Files are first grouped by size; groups are then split by content
    hash, and groups left with a single member are dropped after every
    step.

    With use_partial_hash=True an extra pass hashes only the first
    block_size bytes of each candidate before the full-content pass,
    trying to avoid a full hash of big files that already differ at the
    start. With the default (False) that extra pass is skipped.

    Returns a list of lists (groups) of file names, without the directory
    part, where every group holds files that have the same content.
    """
    def only_repeated(groups):
        # A group with a single member cannot contain a duplicate.
        return [group for group in groups if len(group) > 1]

    candidates = only_repeated(group_files_by_size(dir_to_scan).values())
    if use_partial_hash:
        candidates = only_repeated(
            split_groups_by_hash(candidates, block_size, partial_hash=True))
    candidates = only_repeated(
        split_groups_by_hash(candidates, block_size, partial_hash=False))
    # Convert back to simple file names, without the directory part.
    return [
        [os.path.basename(fpath) for fpath, _ in group]
        for group in candidates
    ]
def execute_from_args(args):
    """
    Run the duplicate search from command-line style arguments.

    args is a list shaped like sys.argv: args[0] is the script name,
    args[1] the directory to scan, and any later items are optional flags:

    - 'grouped': print each group of duplicates as one list per line
      instead of one file name per line.
    - 'partial': hash only the first block of each candidate before
      doing the full-content hash.

    Prints a usage message if no directory is given, and an error message
    if the given path is not a directory.
    """
    if len(args) < 2:
        print("Usage: python find_duplicates.py <dir_to_scan> [grouped] [partial]")
        return

    dir_to_scan = args[1]
    if not os.path.isdir(dir_to_scan):
        print("Invalid directory: {}".format(dir_to_scan))
        return

    # Only look for flags after the directory argument, so a script path
    # or directory that happens to be named like a flag is not misread.
    flags = args[2:]
    # Passing 'partial' turns the partial-hash pass ON (this test used to
    # be inverted, enabling it only when the flag was absent).
    partial_hash = 'partial' in flags
    grouped = 'grouped' in flags

    dup_groups = find_duplicates(dir_to_scan, use_partial_hash=partial_hash)
    for duplicates in dup_groups:
        if grouped:
            print(duplicates)
        else:
            for file_name in duplicates:
                print(file_name)
if __name__ == '__main__':
    # Import sys directly: os.sys only works because the os module
    # happens to import sys internally, which is not a documented API.
    import sys

    execute_from_args(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.