Group duplicate files recursively inside a directory: files are first grouped by size, and only same-size candidates are then confirmed as duplicates by comparing SHA-1 hashes, so files that cannot have a duplicate are never hashed.
import hashlib
import os
import sys
from os.path import join, getsize


def sha1(path):
    """Return the SHA-1 hex digest of a file, read in 1 KiB chunks."""
    m = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            data = f.read(1024)
            if len(data) == 0:
                break
            m.update(data)
    return m.hexdigest()
def find_duplicates(path):
    # 1) Group all files with the same size
    fsizes = {}
    for root, _, files in os.walk(path):
        for name in files:
            filename = join(root, name)
            fsize = getsize(filename)
            if fsize not in fsizes:
                fsizes[fsize] = []
            fsizes[fsize].append(filename)

    # 2) Among same-size files, group those with the same hash
    duplicates = {}
    for key in fsizes:
        if len(fsizes[key]) < 2:
            continue
        for name in fsizes[key]:
            hashid = sha1(name)
            if hashid not in duplicates:
                duplicates[hashid] = []
            duplicates[hashid].append(name)

    # 3) Return only the groups that contain duplicates
    return [matches for matches in duplicates.values() if len(matches) > 1]
if __name__ == '__main__':
    print(find_duplicates(sys.argv[1]))
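A minimal way to exercise the function, assuming it is run in the same module as the code above (the temporary directory and file names below are hypothetical):

# Hypothetical self-check: create two identical files and one distinct file
# in a temporary directory, then confirm only the identical pair is grouped.
import tempfile

tmp = tempfile.mkdtemp()
with open(join(tmp, "a.txt"), "w") as f:
    f.write("same contents")
with open(join(tmp, "b.txt"), "w") as f:
    f.write("same contents")
with open(join(tmp, "c.txt"), "w") as f:
    f.write("different contents")

# Expected: a single group containing a.txt and b.txt
print(find_duplicates(tmp))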