Skip to content

Instantly share code, notes, and snippets.

@anjensan
Created November 5, 2015 10:47
Show Gist options
  • Save anjensan/9e23bc39e39fe1c700f2 to your computer and use it in GitHub Desktop.
Save anjensan/9e23bc39e39fe1c700f2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import print_function
import os
import exifread
import pprint
import magic
import hashlib
import datetime
import errno
import collections
# Name of the directory (used as a relative path) under which the
# date-sorted hard links are created.
DST = "=SORTED="
# Maps the SHA-1 hex digest of every file accepted so far to that file's
# path; used to detect files with identical content.
fs_hashes = {}
# Named integer counters collected during a run; unknown keys start at 0.
stats = collections.defaultdict(int)
def process_file(f):
    """Hard-link one image or video file into the date-sorted tree under DST.

    The file is skipped when its MIME type is neither ``image/*`` nor
    ``video/*``, when a file with the same SHA-1 digest was already seen in
    this run, or when the destination path already exists.  Otherwise it is
    linked to ``DST/YYYY/MM/DD/<sha1><ext>``, where the date comes from the
    EXIF ``Image DateTime`` tag and falls back to the file's modification
    time when the tag is missing or cannot be parsed.

    Args:
        f: Path of the file to process.

    Side effects:
        Updates the module-level ``stats`` and ``fs_hashes``, prints progress
        to stdout, and may create directories and a hard link.
    """
    ff = magic.from_file(f, mime=True)
    # partition() tolerates a MIME string without (or with more than one)
    # slash, where a two-target unpacking of split() would raise.
    ffk = ff.partition("/")[0]
    print()
    print("file", f, "mime", ff)
    if ffk not in {'image', 'video'}:
        print("skip file - unsuported mime")
        return
    stats['file_' + ffk] += 1

    h, size = _sha1_of_file(f)
    stats['totalsize'] += size
    if h in fs_hashes:
        print("hash collision with", fs_hashes[h])
        stats['collision'] += 1
        return
    fs_hashes[h] = f

    with open(f, 'rb') as fo:
        ei = exifread.process_file(fo, details=True)
    tag = ei.get('Image DateTime')
    idd = None
    if tag:
        try:
            idd = parse_datetime(str(tag))
        except ValueError:
            # A malformed EXIF date must not abort the whole scan; fall
            # through to the modification-time fallback below.
            print("unparseable exif datetime", repr(str(tag)))
    if idd is None:
        stats['noexif'] += 1
        print("no exif datetime - use modification date")
        idd = datetime.datetime.fromtimestamp(os.path.getmtime(f))
    print("date:", idd)

    ext = os.path.splitext(f)[1].lower()
    fd = "{r}/{d.year:04}/{d.month:02}/{d.day:02}/{h}{ext}".format(
        r=DST,
        d=idd,
        h=h,
        ext=ext,
    )
    if os.path.exists(fd):
        stats['skips'] += 1
        print("file {0} already exists - skip".format(fd))
        return
    mkdir_p(os.path.dirname(fd))
    os.link(f, fd)
    # Previously counted under the empty key '', which showed up as an
    # unlabeled entry in the printed statistics.
    stats['linked'] += 1
    print(h, "linked")


def _sha1_of_file(path, chunk_size=1 << 20):
    """Return ``(sha1_hexdigest, size_in_bytes)`` of *path*, read in chunks."""
    digest = hashlib.sha1()
    size = 0
    with open(path, 'rb') as fo:
        # Chunked reads keep memory use bounded for large video files.
        for chunk in iter(lambda: fo.read(chunk_size), b''):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest(), size
def do_files_scan(rootdir, each_fn):
    """Call ``each_fn(path)`` for every file below *rootdir*, except in DST.

    The directory named ``DST`` directly inside *rootdir* is excluded
    together with everything beneath it, so files that were already linked
    into the sorted tree are not visited again.

    Args:
        rootdir: Directory whose tree is walked.
        each_fn: Callable invoked with the path of each file found.
    """
    for root, subdirs, files in os.walk(rootdir):
        if root == rootdir:
            # Prune in place: os.walk only skips a subtree when the name is
            # removed from the list it handed out.  Merely skipping the DST
            # directory itself would still descend into its subdirectories.
            subdirs[:] = [d for d in subdirs if d != DST]
        for f in files:
            each_fn(os.path.join(root, f))
def mkdir_p(path):
    """Create *path* and any missing parents; an existing directory is fine.

    Raises:
        OSError: If creation fails for any reason other than *path* already
            existing as a directory.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_dir = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise
# strptime() patterns tried in order by parse_datetime(); the first one that
# matches wins.
DATETIME_FORMATS = [
    # precise datetimes
    "%Y-%m-%d %H:%M:%S.%f%z",
    "%Y-%m-%d %H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S%Z",
    # Literal offset: matches only this exact suffix and, having no %z
    # directive, produces a naive datetime.
    "%Y-%m-%dT%H:%M:%S+03:00",
    "%Y:%m:%d %H:%M:%S",
    # ctime
    "%a %b %d %H:%M:%S %Y",
    # RFC 2822
    "%a, %d %b %Y %H:%M:%S +0000",
    # common dates
    "%Y-%m-%d", "%y-%m-%d",
    "%Y%m%d", "%y%m%d",
    "%Y_%m_%d", "%y_%m_%d",
    "%Y/%m/%d", "%y/%m/%d",
]


def parse_datetime(s):
    """Parse *s* using the first matching pattern in DATETIME_FORMATS.

    Leading and trailing whitespace is ignored, since strptime() would
    otherwise reject an otherwise valid value.

    Args:
        s: Date or datetime string.

    Returns:
        The ``datetime.datetime`` produced by the first matching pattern.

    Raises:
        ValueError: If no pattern matches; the arguments are
            ``("invalid date", s)`` with *s* as originally passed.
    """
    text = s.strip()
    for fmt in DATETIME_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            # Not this pattern (or a directive this Python does not
            # support in strptime); try the next one.
            continue
    raise ValueError("invalid date", s)
def main():
    """Scan the current working directory, then print the run counters."""
    start_dir = os.getcwd()
    do_files_scan(start_dir, process_file)
    pprint.pprint(stats)


# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment