Created
November 18, 2016 18:19
-
-
Save sleibrock/dd11bb387ac5160c06d6c7bbc13e5afa to your computer and use it in GitHub Desktop.
Copy unique files from multiple folders into a new folder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding: utf-8 -*- | |
""" | |
Merge several folders together by comparing file digests.
If the folders contain multiple files with the same digest,
ignore the duplicates and copy only the unique files
into a new target folder. If the target folder doesn't exist,
create a new one.
This does not preserve file names, as files from different
folders may have overlapping names.
""" | |
import shutil | |
import os | |
import os.path | |
from sys import argv | |
from subprocess import getstatusoutput | |
def call(p):
    """Run the shell command string *p* and return its captured output.

    The exit status reported by ``getstatusoutput`` is discarded; only the
    output text (second tuple element) is returned.
    """
    return getstatusoutput(p)[1]


# Name of the external digest program.
sham = "sha256sum"
def _sha256_of(path, chunk_size=1 << 20):
    """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
    import hashlib  # local import so this block does not depend on file-level imports

    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        # Read fixed-size chunks so large files are never loaded whole.
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def diff_folders(new_folder, *folders):
    """
    Copy the unique files found in N folders into ``new_folder``.

    Each regular file directly inside every folder is hashed with SHA-256
    and recorded in a digest => file_path dictionary. A file whose digest
    is already present is treated as a duplicate and skipped, so the first
    file seen for each digest wins. Afterwards every recorded file is
    copied into ``new_folder`` (created if missing, including parents).

    Original file names are not kept: copies are named by their running
    index, followed by the text after the last "." of the source file
    name when it has one (e.g. ``0.txt``, ``1``).

    Parameters:
        new_folder: path of the destination folder.
        *folders: paths of the source folders to scan (non-recursive).

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        OSError: if a folder cannot be listed, a file cannot be read,
            or a copy fails.
    """
    os.makedirs(new_folder, exist_ok=True)

    digests = {}
    for folder in folders:
        d_added = 0
        # Sort the listing so the numbering of the copies is deterministic,
        # and keep only regular files: directories cannot be hashed or
        # copied with shutil.copy.
        candidates = (
            os.path.join(folder, name) for name in sorted(os.listdir(folder))
        )
        files = [path for path in candidates if os.path.isfile(path)]
        print("There are {} files in {}".format(len(files), folder))
        for f in files:
            # Hash in-process instead of through a shell command string, so
            # paths containing spaces or shell metacharacters are safe.
            sha = _sha256_of(f)
            print(sha)
            if sha not in digests:
                digests[sha] = f
                d_added += 1
        print("{} digests were added".format(d_added))

    print("There are {} pairs in the digest dict".format(len(digests)))
    for i, f in enumerate(digests.values()):
        print("Copying {} to new folder...".format(f))
        # Take the extension from the file name only; a "." in a parent
        # folder name must not leak into the target name.
        _, dot, ext = os.path.basename(f).rpartition(".")
        target_name = "{}.{}".format(i, ext) if dot else str(i)
        shutil.copy(f, os.path.join(new_folder, target_name))
    print("done")
if __name__ == "__main__":
    # The first argument is the target folder; the rest are source folders.
    # Exit with a usage message instead of an IndexError when it is missing.
    if len(argv) < 2:
        raise SystemExit(
            "usage: {} NEW_FOLDER [FOLDER ...]".format(argv[0])
        )
    diff_folders(argv[1], *argv[2:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment