Created
November 5, 2015 10:47
-
-
Save anjensan/9e23bc39e39fe1c700f2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
from __future__ import print_function | |
import os | |
import exifread | |
import pprint | |
import magic | |
import hashlib | |
import datetime | |
import errno | |
import collections | |
# Name of the output directory: process_file() builds its YYYY/MM/DD link
# tree under it and do_files_scan() compares walked directories against it.
DST = "=SORTED="

# SHA-1 hex digest -> path of the first file seen with that content.
fs_hashes = dict()

# Named counters (missing keys start at 0); pretty-printed by main().
stats = collections.defaultdict(int)
def process_file(f):
    """Hard-link one image or video file into the date-sorted tree under DST.

    The MIME type is sniffed with ``magic``; files whose major type is not
    ``image`` or ``video`` are skipped. The content is hashed with SHA-1 and
    content already seen during this run (tracked in ``fs_hashes``) is
    skipped. Remaining files are linked as ``DST/YYYY/MM/DD/<sha1><ext>``,
    where the date is the EXIF ``Image DateTime`` tag when present and
    parseable, otherwise the file's modification time. Counters in the
    module-level ``stats`` mapping are updated along the way.

    Args:
        f: Path of the file to process.

    Returns:
        None.
    """
    ff = magic.from_file(f, mime=True)
    # partition() cannot fail on a MIME string without exactly one "/",
    # unlike tuple-unpacking the result of split("/").
    ffk = ff.partition("/")[0]
    print()
    print("file", f, "mime", ff)
    if ffk not in {'image', 'video'}:
        print("skip file - unsuported mime")
        return
    stats['file_' + ffk] += 1

    # Hash in fixed-size chunks so large videos are never held in memory
    # in one piece.
    sha1 = hashlib.sha1()
    size = 0
    with open(f, 'rb') as fo:
        for chunk in iter(lambda: fo.read(1 << 20), b''):
            sha1.update(chunk)
            size += len(chunk)
    h = sha1.hexdigest()
    stats['totalsize'] += size

    if h in fs_hashes:
        # Same content was already processed under another path.
        print("hash collision with", fs_hashes[h])
        stats['collision'] += 1
        return
    fs_hashes[h] = f

    with open(f, 'rb') as fo:
        ei = exifread.process_file(fo, details=True)

    idd = None
    tag = ei.get('Image DateTime')
    if tag:
        try:
            idd = parse_datetime(str(tag))
        except ValueError:
            # A malformed EXIF date must not abort the whole scan; it is
            # handled like a missing one below.
            print("unparseable exif datetime", repr(str(tag)))
    if idd is None:
        stats['noexif'] += 1
        print("no exif datetime - use modification date")
        idd = datetime.datetime.fromtimestamp(os.path.getmtime(f))
    print("date:", idd)

    ext = os.path.splitext(f)[1].lower()
    fd = "{r}/{d.year:04}/{d.month:02}/{d.day:02}/{h}{ext}".format(
        r=DST,
        d=idd,
        h=h,
        ext=ext,
    )
    if os.path.exists(fd):
        stats['skips'] += 1
        print("file {0} already exists - skip".format(fd))
        return
    mkdir_p(os.path.dirname(fd))
    os.link(f, fd)
    # The original counted successful links under an empty key.
    stats['linked'] += 1
    print(h, "linked")
def do_files_scan(rootdir, each_fn, skip_dir=None):
    """Call ``each_fn(path)`` for every file below ``rootdir``.

    The directory ``rootdir/<skip_dir>`` is excluded together with everything
    beneath it, so files already placed in the output tree are not visited
    again. Only that top-level directory is excluded; a directory with the
    same name deeper in the tree is still scanned.

    Args:
        rootdir: Directory to walk recursively.
        each_fn: Callable invoked with the path of each file found.
        skip_dir: Name of the directory directly under ``rootdir`` to
            exclude. Defaults to the module-level ``DST``.
    """
    if skip_dir is None:
        skip_dir = DST
    skip_path = os.path.join(rootdir, skip_dir)
    for root, subdirs, files in os.walk(rootdir):
        # Prune in place: in top-down mode os.walk only descends into the
        # names left in ``subdirs``, so the whole excluded subtree is dropped.
        subdirs[:] = [d for d in subdirs
                      if os.path.join(root, d) != skip_path]
        for f in files:
            each_fn(os.path.join(root, f))
def mkdir_p(path):
    """Create ``path`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Any other ``OSError``
    (including ``path`` existing as a non-directory) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_dir = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise
# strptime() patterns tried by parse_datetime(), in this order.
DATETIME_FORMATS = [
    # Full date-and-time forms.
    "%Y-%m-%d %H:%M:%S.%f%z",
    "%Y-%m-%d %H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S%Z",
    "%Y-%m-%dT%H:%M:%S+03:00",
    "%Y:%m:%d %H:%M:%S",
    # ctime() layout.
    "%a %b %d %H:%M:%S %Y",
    # RFC 2822 layout with a literal +0000 offset.
    "%a, %d %b %Y %H:%M:%S +0000",
    # Date-only forms, four-digit year before two-digit year.
    "%Y-%m-%d",
    "%y-%m-%d",
    "%Y%m%d",
    "%y%m%d",
    "%Y_%m_%d",
    "%y_%m_%d",
    "%Y/%m/%d",
    "%y/%m/%d",
]
def parse_datetime(s):
    """Parse ``s`` with the first pattern in DATETIME_FORMATS that matches.

    Args:
        s: Date or date-time string.

    Returns:
        The ``datetime.datetime`` produced by the first matching pattern.

    Raises:
        ValueError: If no pattern matches; the arguments are
            ``("invalid date", s)``.
    """
    for pattern in DATETIME_FORMATS:
        try:
            return datetime.datetime.strptime(s, pattern)
        except ValueError:
            # This pattern does not fit; move on to the next one.
            continue
    raise ValueError("invalid date", s)
def main():
    """Process every file under the current directory, then print the counters."""
    start_dir = os.getcwd()
    do_files_scan(start_dir, process_file)
    pprint.pprint(stats)


# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment