Skip to content

Instantly share code, notes, and snippets.

@sleibrock
Created November 18, 2016 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sleibrock/dd11bb387ac5160c06d6c7bbc13e5afa to your computer and use it in GitHub Desktop.
Save sleibrock/dd11bb387ac5160c06d6c7bbc13e5afa to your computer and use it in GitHub Desktop.
Copy unique files from multiple folders into a new folder
#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""
Merge two or more folders together by comparing SHA-256 file digests.
If the folders contain multiple files with the same digest,
ignore the duplicates and copy only the unique files
into a new target folder. If the target folder doesn't exist,
create it.
File names are not preserved, because files from different
folders may have overlapping names.
"""
import hashlib
import os
import os.path
import shutil
from subprocess import getstatusoutput
from sys import argv
# Name of the external program used to compute file digests.
sham = "sha256sum"


def call(p):
    """Run the shell command *p* and return its captured output text."""
    _status, output = getstatusoutput(p)
    return output
def _sha256_of(path, chunk_size=1 << 20):
    """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
    hasher = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


def diff_folders(new_folder, *folders):
    """
    Copy the unique files found in ``folders`` into ``new_folder``.

    Every regular file directly inside each folder is hashed with SHA-256
    and recorded in a digest => file_path dictionary. The first file seen
    for a digest wins; later files with the same digest are duplicates and
    are skipped. Afterwards each unique file is copied to ``new_folder``
    under a numeric name (``0``, ``1.txt``, ...) that keeps only the
    original extension.

    Args:
        new_folder: Destination directory; created (including missing
            parents) when it does not exist.
        *folders: Source directories to scan. Only their direct entries
            are considered; sub-directories are skipped.

    Returns:
        None. Progress is reported on standard output.

    Raises:
        OSError: If a source folder cannot be listed, a file cannot be
            read, or a copy fails.
    """
    os.makedirs(new_folder, exist_ok=True)

    digests = {}
    for folder in folders:
        d_added = 0
        # Sorted so the numeric output names are deterministic; directories
        # are dropped because they can be neither hashed nor copied as files.
        candidates = (
            os.path.join(folder, name) for name in sorted(os.listdir(folder))
        )
        files = [path for path in candidates if os.path.isfile(path)]
        print("There are {} files in {}".format(len(files), folder))
        for f in files:
            # Hash in-process: no shell is involved, so file names with
            # spaces or shell metacharacters are handled safely.
            sha = _sha256_of(f)
            print(sha)
            if sha not in digests:
                digests[sha] = f
                d_added += 1
        print("{} digests were added".format(d_added))

    print("There are {} pairs in the digest dict".format(len(digests)))
    for i, f in enumerate(digests.values()):
        print("Copying {} to new folder...".format(f))
        # Take the extension from the file name only, so a dot in a parent
        # directory (e.g. "./src/file") is never mistaken for an extension.
        base = os.path.basename(f)
        name = str(i)
        if "." in base:
            name = "{}.{}".format(i, base.rsplit(".", 1)[-1])
        shutil.copy(f, os.path.join(new_folder, name))
    print("done")
if __name__ == "__main__":
    # argv[1] is the target folder; every following argument is a source
    # folder. Exit with a usage message instead of an IndexError traceback
    # when the target is missing.
    if len(argv) < 2:
        raise SystemExit("usage: {} NEW_FOLDER [FOLDER ...]".format(argv[0]))
    diff_folders(argv[1], *argv[2:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment