Skip to content

Instantly share code, notes, and snippets.

@honzabrecka
Created February 22, 2019 13:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save honzabrecka/cf97a18bd9fe55cd6150514d07ae0a0d to your computer and use it in GitHub Desktop.
Save honzabrecka/cf97a18bd9fe55cd6150514d07ae0a0d to your computer and use it in GitHub Desktop.
import hashlib
import os
from os.path import exists, isfile, join, basename
import shutil
def md5(file):
    """Return the hexadecimal MD5 digest of the file at path ``file``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    the whole content is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file, "rb") as stream:
        while True:
            block = stream.read(4096)
            # An empty bytes object signals end of file.
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def listDir(dir, extensions=(".jpg",)):
    """Return the sorted paths of matching regular files directly in ``dir``.

    Args:
        dir: Directory to scan. Sub-directories are not descended into.
        extensions: A suffix, or an iterable of suffixes, a path must end
            with to be included. The comparison is case-sensitive. Defaults
            to ``(".jpg",)``.

    Returns:
        A list of ``join(dir, name)`` paths in sorted order. Sorting matters
        because ``os.listdir`` yields entries in arbitrary order, and callers
        that keep "the first" file of a group would otherwise pick a
        different one from run to run.

    Raises:
        OSError: If ``dir`` cannot be listed.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(extensions, str):
        suffixes = (extensions,)
    else:
        suffixes = tuple(extensions)
    candidates = (join(dir, name) for name in os.listdir(dir))
    return sorted(
        path
        for path in candidates
        if isfile(path) and path.endswith(suffixes)
    )
def hashes(files):
    """Group file paths by the MD5 digest of their content.

    Returns a dict mapping each digest (as produced by ``md5``) to the list
    of paths from ``files`` that have that digest, in input order.
    """
    by_digest = {}
    for path in files:
        # setdefault creates the group on first sight of a digest.
        by_digest.setdefault(md5(path), []).append(path)
    return by_digest
def dedup(hashes, dest):
    """Copy one file per group into the directory ``dest``.

    ``hashes`` maps a key to a list of file paths; only the first path of
    each list is copied (with ``shutil.copy2``), keeping its base name.
    """
    for group in hashes.values():
        keeper = group[0]
        target = join(dest, basename(keeper))
        shutil.copy2(keeper, target)
def main(source, dest):
    """Copy one file per distinct content from ``source`` into ``dest``.

    Files are collected with ``listDir``, grouped by content digest with
    ``hashes``, and one file per group is copied by ``dedup``.

    Args:
        source: Directory to scan for files.
        dest: Directory that receives the copies; created (including
            missing parents) if it does not exist yet.

    Raises:
        OSError: If ``dest`` cannot be created, or if reading or copying a
            file fails.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(dest, exist_ok=True)
    dedup(hashes(listDir(source)), dest)
if __name__ == '__main__':
    # Source and destination may be passed on the command line; when they
    # are omitted, the original hard-coded locations are used.
    import argparse

    parser = argparse.ArgumentParser(
        description="Copy one file per distinct content into a directory."
    )
    parser.add_argument(
        "source",
        nargs="?",
        default="/Users/jx/Work/yachting/dedup/data",
        help="directory to scan",
    )
    parser.add_argument(
        "dest",
        nargs="?",
        default="/Users/jx/Work/yachting/dedup/x",
        help="directory that receives the copies",
    )
    args = parser.parse_args()
    main(args.source, args.dest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment