Last active
September 28, 2019 10:52
-
-
Save pavelkryukov/15f93d19a99428a284a8bcec27e0187b to your computer and use it in GitHub Desktop.
This script finds local versions of images fetched by InstaLoader.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Copyright (C) Pavel Kryukov 2017
import argparse
import contextlib
import os
import re
import shelve
import shutil

import imagehash
from PIL import Image
from tqdm import tqdm
def is_image_filename(f):
    """Return True if the file name *f* has a JPEG extension.

    The extension is compared case-insensitively, so mixed-case spellings
    such as ``.Jpg`` are accepted alongside ``.jpg`` and ``.JPG``.
    """
    return os.path.splitext(f)[1].lower() in ('.jpg', '.jpeg', '.jpe')
def update_database(path, db_name):
    """Add difference hashes of not-yet-indexed images under *path* to the database.

    Walks *path* recursively and, for every image file whose path is not
    already a key of the shelve database, stores its ``imagehash.dhash``
    value under that path. Files that cannot be opened or hashed are
    reported on stdout and skipped.

    Args:
        path: Root directory to scan for image files.
        db_name: File name of the shelve database to create or update.

    Raises:
        IOError: If the database cannot be opened (re-raised after a
            message is printed).
    """
    sources = [
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(path)
        for name in filenames
        if is_image_filename(name)
    ]

    try:
        db = shelve.open(db_name, writeback=True)
    except IOError as e:
        print("Unable to open database: " + str(e))
        raise  # bare raise keeps the original traceback

    # closing() guarantees the shelf is actually closed (and thereby written
    # out), even if hashing is interrupted by an unexpected exception.
    with contextlib.closing(db):
        if len(db) == 0:
            print("Warning: empty database")

        for filename in tqdm(sources):
            if filename in db:
                continue
            try:
                image = Image.open(filename)
            except IOError as e:
                print("Unable to open file: " + str(e))
                continue
            try:
                h = imagehash.dhash(image)
            except IOError as e:
                print("Corrupted image file: " + str(e))
                continue
            db[filename] = h  # TODO pkryukov: this should be indexed by md5 of file
def lookup_database(path, db_name):
    """Copy the closest-matching indexed image next to each image under *path*.

    For every image file under *path* whose name does not contain
    ``_better``, the database entry with the smallest hash distance to the
    file's ``imagehash.dhash`` is looked up. If that distance is below 5 and
    the matched file still exists, it is copied to
    ``<name>_better<extension>`` beside the original. Files that already
    have such a companion are skipped, as are files that cannot be opened
    or hashed.

    Args:
        path: Root directory to scan for image files.
        db_name: File name of the shelve database holding the hashes.

    Raises:
        IOError: If the database cannot be opened (re-raised after a
            message is printed).
    """
    suffix = '_better'
    instagrams = [
        os.path.join(dirpath, name)
        for dirpath, _, filenames in os.walk(path)
        for name in filenames
        if is_image_filename(name) and suffix not in name
    ]

    try:
        db = shelve.open(db_name)
    except IOError as e:
        print("Unable to open database: " + str(e))
        raise  # bare raise keeps the original traceback

    # Read every stored hash once up front: going through the shelf for each
    # comparison would unpickle the whole database again for every image.
    with contextlib.closing(db):
        known = [(name, db[name]) for name in db]

    if not known:
        # min() over an empty sequence would raise ValueError below.
        print("Warning: empty database")
        return

    for filename in tqdm(instagrams):
        # Insert the suffix before the real extension only; this works for
        # every accepted extension and never touches the directory part.
        root, ext = os.path.splitext(filename)
        better_file = root + suffix + ext
        if os.path.isfile(better_file):
            continue

        try:
            image = Image.open(filename)
        except IOError as e:
            print("Unable to open file: " + str(e))
            continue
        try:
            h = imagehash.dhash(image)
        except IOError as e:
            print("Corrupted image file: " + str(e))
            continue

        best, best_hash = min(known, key=lambda entry: entry[1] - h)
        # Consider images as equal if their hashes differ by no more than 4 bits.
        # However, it can lead to funny collisions :-)
        if best_hash - h < 5 and os.path.isfile(best):
            shutil.copy2(best, better_file)
def main():
    """Parse the command line, refresh the hash database, then run the lookup."""
    parser = argparse.ArgumentParser()
    # All three options are mandatory and differ only in flags and help text.
    option_specs = (
        ("-s", "--sources", "path to local images"),
        ("-t", "--targets", "path to fetched images"),
        ("-d", "--database", "shelve database file"),
    )
    for short_flag, long_flag, description in option_specs:
        parser.add_argument(short_flag, long_flag, required=True, help=description)

    options = parser.parse_args()
    update_database(options.sources, options.database)
    lookup_database(options.targets, options.database)
# Run only when executed as a script, so that importing this module does not
# trigger argument parsing and database work.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment