Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Find file duplicates inside a directory with python.
Find file duplicates inside a directory through sha1 hashing
of file contents.
Uses early discard of candidate files by first checking their sizes, and
optionally by hashing only the first part of each file.
from __future__ import print_function
import os
import os.path
from hashlib import sha1
from collections import defaultdict
def partial_file_hash(filepath, num_bytes, block_size=65536):
    """Return the sha1 hex digest of the first num_bytes of a file.

    The file is read in chunks of at most block_size bytes so the
    amount of data held in memory at once stays bounded.

    :param filepath: path of the file to hash.
    :param num_bytes: how many leading bytes of the file to hash.
    :param block_size: maximum chunk size read per iteration.
    :return: hex digest string of the hashed prefix.
    """
    hasher = sha1()
    with open(filepath, 'rb') as file_to_hash:
        while num_bytes > 0:
            size = min(num_bytes, block_size)
            num_bytes = num_bytes - size
            file_chunk = file_to_hash.read(size)
            if not file_chunk:
                # File is shorter than requested; stop at EOF instead
                # of looping on empty reads.
                break
            hasher.update(file_chunk)
    return hasher.hexdigest()
def group_files_by_size(dir_to_scan):
    """Group the direct children of a directory by file size.

    :param dir_to_scan: directory whose direct children are listed.
    :return: dict mapping file size to a list of (filepath, size)
        tuples; files with a unique size cannot be duplicates and are
        cheaply separated this way.
    """
    by_size = defaultdict(list)
    for fname in os.listdir(dir_to_scan):
        fpath = os.path.join(dir_to_scan, fname)
        # Skip subdirectories and other non-regular entries: only
        # regular files can be content duplicates.
        if not os.path.isfile(fpath):
            continue
        size = os.stat(fpath).st_size
        by_size[size].append((fpath, size))
    return by_size
def split_files_by_hash(file_and_size_list, block_size, partial_hash=False):
    """Split a list of (filepath, size) tuples into groups that share
    the same content hash.

    The block_size param is used to limit the amount of in-memory data
    that is read while iterating through each file.

    :param file_and_size_list: list of (filepath, size) tuples.
    :param block_size: maximum chunk size read per iteration.
    :param partial_hash: when True, only the first block_size bytes of
        each file are hashed (useful for early discard of really big
        files).
    :return: list of lists of (filepath, size) tuples, one list per
        distinct hash value.
    """
    by_hash = defaultdict(list)
    for fname, size in file_and_size_list:
        if partial_hash:
            num_bytes = min(size, block_size)
        else:
            # Hash the whole file.
            num_bytes = size
        hash_string = partial_file_hash(fname, num_bytes, block_size)
        by_hash[hash_string].append((fname, size))
    return list(by_hash.values())
def split_groups_by_hash(file_and_size_groups, block_size, partial_hash=False):
    """Given a list of lists (groups) of (filepath, size) tuples, split
    the groups further by content hash and return the new list of lists.

    :param file_and_size_groups: list of lists of (filepath, size).
    :param block_size: maximum chunk size read per iteration.
    :param partial_hash: when True, only the first block_size bytes of
        each file are hashed (useful for early discard of really big
        files).
    :return: flat list of the hash-split groups from every input group.
    """
    all_groups = []
    for file_and_size_list in file_and_size_groups:
        by_hash = split_files_by_hash(file_and_size_list, block_size, partial_hash)
        # Accumulate every hash-split sub-group; without this the
        # function would always return an empty list.
        all_groups.extend(by_hash)
    return all_groups
def find_duplicates(dir_to_scan, block_size=65536, use_partial_hash=False):
    """Given a valid directory, list the files inside it (only direct
    children of the directory) and return a list of lists (groups) of
    file names that have the same content.

    :param dir_to_scan: directory whose direct children are compared.
    :param block_size: maximum chunk size read per iteration.
    :param use_partial_hash: when True, first compare files on the hash
        of only their first block_size bytes, trying to avoid a full
        file hash for big files that differ early.
    :return: list of lists of duplicate file names (directory part
        stripped); groups of size 1 are discarded.
    """
    files_by_size = group_files_by_size(dir_to_scan)
    # Only same-size files can be duplicates.
    candidates = [v for v in files_by_size.values() if len(v) > 1]
    if use_partial_hash:
        # Cheap pre-pass: discard files whose first block already differs.
        candidates = split_groups_by_hash(candidates, block_size, partial_hash=True)
        candidates = [v for v in candidates if len(v) > 1]
    # Full-content hash pass.
    candidates = split_groups_by_hash(candidates, block_size, partial_hash=False)
    candidates = [v for v in candidates if len(v) > 1]
    # convert back to simple file names, without the directory part:
    duplicates = []
    for c_group in candidates:
        duplicates.append([os.path.split(fpath)[1] for fpath, _ in c_group])
    return duplicates
def execute_from_args(args):
    """Execute the find duplicates from arguments passed from commandline.

    :param args: sys.argv-like list; args[1] is the directory to scan,
        and the optional literal arguments 'grouped' and 'partial'
        toggle grouped output and the partial-hash early-discard step.
    """
    if len(args) < 2:
        print("Usage: python <dir_to_scan> [grouped] [partial]")
        return
    dir_to_scan = args[1]
    if not os.path.isdir(dir_to_scan):
        print("Invalid directory: {}".format(dir_to_scan))
        return
    # Presence of the flags enables the corresponding behavior, as the
    # usage message documents.
    partial_hash = bool('partial' in args)
    grouped = bool('grouped' in args)
    dup_groups = find_duplicates(dir_to_scan, use_partial_hash=partial_hash)
    for duplicates in dup_groups:
        if grouped:
            # One line per duplicate group.
            print(duplicates)
        else:
            # One line per duplicate file name.
            for file_name in duplicates:
                print(file_name)
if __name__ == '__main__':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.