Skip to content

Instantly share code, notes, and snippets.

@SapphicCode
Last active September 4, 2021 12:37
Show Gist options
  • Save SapphicCode/a516e29da1133a1c87405bf91c121808 to your computer and use it in GitHub Desktop.
Save SapphicCode/a516e29da1133a1c87405bf91c121808 to your computer and use it in GitHub Desktop.
A script to ingest recurring archives from various service exports
import os
import sys
import hashlib
import json
# Name of the hashlib constructor to use; also the name of the store directory.
algorithm = 'sha3_256'
# Number of bytes read per call while hashing, so large files are never
# loaded into memory in one piece.
CHUNK_SIZE = 4 * 1024 * 1024


def _load_digests(store_dir):
    """Return the saved digest -> [paths] map, or {} if none was saved yet."""
    try:
        with open(os.path.join(store_dir, 'digests.json'), 'r',
                  encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def _hash_file(file_path, chunk_size):
    """Return the hex digest of the *entire* file at ``file_path``."""
    hasher = getattr(hashlib, algorithm)()
    with open(file_path, 'rb') as f:
        # Keep reading until read() returns b'' (EOF). A single read() would
        # only cover the first chunk, so files sharing a long common prefix
        # would collide and be discarded as "duplicates".
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def _same_path(first, second):
    """Return True if both paths name the same absolute location."""
    return (os.path.normcase(os.path.abspath(first))
            == os.path.normcase(os.path.abspath(second)))


def _store_file(file_path, digest, store_dir):
    """Move ``file_path`` into the store, or delete it if already stored."""
    digest_path = os.path.join(store_dir, digest[:2])
    digest_file_path = os.path.join(digest_path, f'{digest}.bin')
    os.makedirs(digest_path, exist_ok=True)
    # os.rename() only raises FileExistsError on Windows; on POSIX it silently
    # replaces the destination. Check explicitly so both platforms agree.
    if not os.path.exists(digest_file_path):
        try:
            os.rename(file_path, digest_file_path)
            return
        except FileExistsError:
            # The destination appeared between the check and the rename.
            pass
    print("duplicated file encountered. discarding.")
    os.remove(file_path)


def main(source_dir, store_dir=algorithm, chunk_size=CHUNK_SIZE):
    """Ingest every file below ``source_dir`` into a content-addressed store.

    Each file is hashed with ``algorithm`` and moved to
    ``<store_dir>/<first two hex chars>/<digest>.bin``. A file whose digest is
    already stored is deleted instead. The path each file was found at is
    recorded (with forward slashes and without a leading ``./``) under its
    digest in ``<store_dir>/digests.json``, which is loaded at the start and
    rewritten at the end.

    Args:
        source_dir: Directory tree to ingest. Files are moved out of it.
        store_dir: Directory holding the blobs and ``digests.json``.
        chunk_size: Number of bytes read per call while hashing.

    Returns:
        The updated mapping of hex digest to list of recorded paths.

    Raises:
        AttributeError: Re-raised from the walk, e.g. when ``algorithm`` does
            not name a hashlib constructor. Any other exception raised during
            the walk is printed and stops the walk; the map collected so far
            is still saved.
    """
    digests = _load_digests(store_dir)
    try:
        for directory, subdirs, files in os.walk(source_dir):
            # Never ingest the store itself: that would re-hash stored blobs
            # and move digests.json into the store.
            if _same_path(directory, store_dir):
                subdirs[:] = []
                continue
            subdirs[:] = [
                name for name in subdirs
                if not _same_path(os.path.join(directory, name), store_dir)
            ]
            for file in files:
                file_path = os.path.join(directory, file)
                digest = _hash_file(file_path, chunk_size)
                # Normalise only the *recorded* path. The real path is kept
                # for filesystem calls, because a backslash is a legal
                # filename character outside Windows.
                recorded_path = file_path.replace(os.sep, "/")
                if recorded_path.startswith("./"):
                    recorded_path = recorded_path[2:]
                known_paths = digests.setdefault(digest, [])
                if recorded_path not in known_paths:
                    known_paths.append(recorded_path)
                _store_file(file_path, digest, store_dir)
    except AttributeError:
        raise
    except Exception as e:
        print(e)
    # The store directory does not exist yet when nothing has been ingested.
    os.makedirs(store_dir, exist_ok=True)
    with open(os.path.join(store_dir, 'digests.json'), 'w',
              encoding='utf-8') as f:
        json.dump(digests, f, indent=2)
    return digests


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <directory>')
    file_map = main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment