Last active
May 26, 2018 10:39
-
-
Save duhaime/2bbe4fb77f94b6f951dbb409cc469a23 to your computer and use it in GitHub Desktop.
Find image reprints
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
from itertools import combinations | |
from pymongo import MongoClient | |
from collections import deque | |
def group_lists(list_of_lists):
    '''Merge every list that shares an element with another, transitively,
    until no two of the returned groups have an item in common.

    Args:
        list_of_lists: iterable of iterables of hashable, mutually orderable
            items. Empty sublists are ignored.

    Returns:
        list: one sorted, duplicate-free list per connected group, with the
        groups themselves ordered by their smallest member.
    '''
    # disjoint-set forest: each item points towards its group representative
    parent = {}

    def find(item):
        '''Return the representative of the group that holds `item`.'''
        while parent[item] != item:
            # point the item at its grandparent to keep later lookups short
            parent[item] = parent[parent[item]]
            item = parent[item]
        return item

    for members in list_of_lists:
        members = list(members)
        if not members:
            continue
        for item in members:
            parent.setdefault(item, item)
        # everything in one input list belongs to the same group
        root = find(members[0])
        for item in members[1:]:
            other = find(item)
            if other != root:
                parent[other] = root

    # comparing only neighbouring lists misses overlaps between lists that
    # are separated by an unrelated one; collecting by representative does not
    groups = {}
    for item in parent:
        groups.setdefault(find(item), []).append(item)
    return sorted((sorted(group) for group in groups.values()),
                  key=lambda group: group[0])
def get_image_id_to_type():
    '''Build a lookup table from image id to image type.

    Reads the `ecco` collection of the `eea` database through a default
    `MongoClient()`, keeping only documents whose `images` field exists and
    is not an empty list.

    Returns:
        dict: maps each image's `image_id` (with any '.TIF' substring
        removed) to that image's `type`.
    '''
    print(' * fetching metadata')
    collection = MongoClient().eea.ecco
    # select documents that carry images, and fetch nothing but that field
    has_images = {'images': {'$exists': True, '$ne': []}}
    only_images = {'images': 1, '_id': 0}
    documents = list(collection.find(has_images, only_images))
    return {
        entry['image_id'].replace('.TIF', ''): entry['type']
        for document in documents
        for entry in document['images']
    }
def get_match_groups(results_path='results.json', image_types=None):
    '''Return a list of lists where each sublist holds one queried image
    together with the sufficiently similar images found for it.

    Images typed 'music' are skipped, both as queries and as matches. A match
    qualifies when its distance is strictly between 0 and 0.4 (0 means the
    identical file) and its record id differs from the queried image's.

    Args:
        results_path: path of the JSON file to read. It must map each queried
            image path to a list of match objects with `dist` and `path` keys.
        image_types: mapping from image id to image type. When omitted, the
            module-level `image_id_to_type` mapping is used.

    Returns:
        list: one sorted list of paths per queried image that has at least
        one qualifying match.

    Raises:
        KeyError: if a queried or matched image id is missing from the
            type mapping.
    '''
    if image_types is None:
        image_types = image_id_to_type
    # the context manager closes the results file once it has been parsed
    with open(results_path) as handle:
        results = json.load(handle)
    match_groups = []
    total = len(results)
    for idx, (image_path, matches) in enumerate(results.items()):
        print(' * processed', idx, 'of', total, 'images')
        image_id = os.path.basename(image_path).replace('.jpg', '')
        # the first ten characters of an image id are used as its record id
        record_id = image_id[:10]
        if image_types[image_id] == 'music':
            continue
        match_set = set()
        for match in matches:
            dist = match['dist']
            match_path = match['path']
            match_image_id = os.path.basename(match_path).replace('.jpg', '')
            if image_types[match_image_id] == 'music':
                continue
            match_record_id = match_image_id[:10]
            # distance of 0 = identical file
            if 0 < dist < 0.4 and record_id != match_record_id:
                match_set.add(match_path)
                match_set.add(image_path)
        if match_set:
            # sorted so the output does not depend on set iteration order
            match_groups.append(sorted(match_set))
    return match_groups
# look up every image's type once; get_match_groups reads this module-level name
image_id_to_type = get_image_id_to_type()
# gather the per-image match lists, then merge the lists that share an image
match_groups = get_match_groups()
clustered = group_lists(match_groups)
# persist the merged groups as JSON
with open('groups.json', 'w') as out:
    out.write(json.dumps(list(clustered)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch | |
from image_match.elasticsearch_driver import SignatureES | |
from multiprocessing import Pool | |
from glob2 import glob | |
import json | |
# number of worker processes handed to each Pool below
cores = 32
# image files to index and then query, collected with a glob2 `**` pattern
imgs = glob('../copyright/data/ecco/images/originals/**/*.jpg')
# image signature store built on a default Elasticsearch client
es = Elasticsearch()
ses = SignatureES(
    es,
    timeout='120s',
    distance_cutoff=0.4,
)
def index(path):
    '''Add the image at `path` to the signature index.

    Failures are reported on stdout and swallowed, so one bad image does not
    abort a bulk indexing run.

    Args:
        path: location of the image file to add.

    Returns:
        None.
    '''
    try:
        ses.add_image(path)
    except Exception as exc:
        # best-effort: report the failure together with its cause, then move on
        print(' ! error processing', path, exc)
def query(path):
    '''Search the signature index for images similar to the one at `path`.

    Args:
        path: location of the image file to search with.

    Returns:
        list: `[path, matches]`, where `matches` is the value returned by
        `ses.search_image(path, all_orientations=True)`, or an empty list
        when the search raised.
    '''
    try:
        return [path, ses.search_image(path, all_orientations=True)]
    except Exception as exc:
        print(' ! error querying for', path, exc)
        # still hand back a [path, matches] pair: the caller unpacks the
        # result, and an implicit None would crash it
        return [path, []]
# Guarded so that worker processes which re-import this module (the spawn
# start method) define the helpers above without launching pools of their own.
if __name__ == '__main__':
    # pass 1: add every image to the index
    pool = Pool(cores)
    for idx, r in enumerate(pool.imap(index, imgs)):
        print(' * processed', idx+1, 'images')
    pool.close()
    pool.join()

    # pass 2: query the index with every image and collect the matches
    pool = Pool(cores)
    results = {}
    for idx, r in enumerate(pool.imap(query, imgs)):
        print(' * ran', idx+1, 'queries')
        if r is None:
            # a query that produced no result has nothing to unpack
            continue
        path, matches = r
        results[path] = matches
    # release the second pool's workers as well
    pool.close()
    pool.join()

    with open('results.json', 'w') as out:
        json.dump(results, out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment