Skip to content

Instantly share code, notes, and snippets.

@duhaime
Last active May 26, 2018 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duhaime/2bbe4fb77f94b6f951dbb409cc469a23 to your computer and use it in GitHub Desktop.
Save duhaime/2bbe4fb77f94b6f951dbb409cc469a23 to your computer and use it in GitHub Desktop.
Find image reprints
import json
import os
from itertools import combinations
from pymongo import MongoClient
from collections import deque
def group_lists(list_of_lists):
    '''Merge all sublists that share an element, transitively, until
    no two of the returned groups have an item in common.

    Args:
      list_of_lists: an iterable of iterables of hashable, mutually
        orderable items (e.g. file paths). Empty sublists are ignored.

    Returns:
      A list of lists. Every input item appears in exactly one group,
      each group is sorted, and the groups are ordered by their
      smallest item.
    '''
    # Comparing only neighbouring sublists misses chains such as
    # [1, 5] / [2, 3] / [4, 5], so every item is tracked in a
    # disjoint-set forest instead: parent[item] leads to the group root.
    parent = {}

    def find(item):
        '''Return the root of the group holding item, flattening the path'''
        root = item
        while parent[root] != root:
            root = parent[root]
        while parent[item] != root:
            parent[item], item = root, parent[item]
        return root

    for sublist in list_of_lists:
        members = list(sublist)
        if not members:
            continue
        for member in members:
            parent.setdefault(member, member)
        # everything in one sublist belongs to the same group
        anchor = find(members[0])
        for member in members[1:]:
            root = find(member)
            if root != anchor:
                parent[root] = anchor

    groups = {}
    for item in parent:
        groups.setdefault(find(item), []).append(item)

    grouped = [sorted(members) for members in groups.values()]
    # groups are disjoint, so their first (smallest) items are unique
    grouped.sort(key=lambda members: members[0])
    return grouped
def get_image_id_to_type():
    '''Return a mapping from image id to image type.

    Connects with a default MongoClient, reads the `ecco` collection of
    the `eea` database, and visits every document whose `images` field
    exists and is not an empty list.

    Returns:
      A dict keyed by each image's `image_id` with any '.TIF' substring
      removed, whose values are that image's `type`.
    '''
    print(' * fetching metadata')
    client = MongoClient()
    try:
        db = client.eea
        query = {'images': {'$exists': True, '$ne': []}}
        # only the images array is needed from each document
        projection = {'images': 1, '_id': 0}
        image_id_to_type = {}
        # walk the cursor directly rather than loading every record
        # into a list first
        for record in db.ecco.find(query, projection):
            for image in record['images']:
                image_id = image['image_id'].replace('.TIF', '')
                image_id_to_type[image_id] = image['type']
        return image_id_to_type
    finally:
        # release the connection once the mapping is built
        client.close()
def get_match_groups(results_path='results.json', id_to_type=None):
    '''Return a list of lists where each sublist holds the paths of
    images that are sufficiently similar to one query image.

    Args:
      results_path: JSON file mapping each query image path to a list
        of matches, where every match has a `dist` and a `path` key.
      id_to_type: dict from image id (file basename without '.jpg') to
        image type. Defaults to the module-level `image_id_to_type`.

    Returns:
      One sorted list per query image that has at least one accepted
      match; the list holds the query path plus the accepted match
      paths. A match is accepted when neither image has type 'music',
      0 < dist < 0.4, and the two images come from different records.

    Raises:
      KeyError: if an image id is missing from the id-to-type mapping.
    '''
    if id_to_type is None:
        id_to_type = image_id_to_type

    # close the results file as soon as it has been parsed
    with open(results_path) as handle:
        results = json.load(handle)

    match_groups = []
    total = len(results)
    for idx, query_path in enumerate(results):
        print(' * processed', idx, 'of', total, 'images')
        image_id = os.path.basename(query_path).replace('.jpg', '')
        if id_to_type[image_id] == 'music':
            continue
        # the first 10 characters of an image id are used as its record id
        record_id = image_id[:10]
        match_set = set()
        for match in results[query_path]:
            dist = match['dist']
            match_path = match['path']
            match_image_id = os.path.basename(match_path).replace('.jpg', '')
            if id_to_type[match_image_id] == 'music':
                continue
            match_record_id = match_image_id[:10]
            # distance of 0 = identical file
            if 0 < dist < 0.4 and record_id != match_record_id:
                match_set.add(match_path)
                match_set.add(query_path)
        if match_set:
            match_groups.append(sorted(match_set))
    return match_groups
# get_match_groups() reads this module-level mapping, so it has to be
# bound before the match groups are built
image_id_to_type = get_image_id_to_type()

# one candidate group per query image, then merge groups that overlap
match_groups = get_match_groups()
clustered = group_lists(match_groups)

# persist the merged clusters as a JSON array of arrays
with open('groups.json', 'w') as out:
    out.write(json.dumps(list(clustered)))
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES
from multiprocessing import Pool
from glob2 import glob
import json
# paths of the .jpg files under the originals directory
# NOTE(review): presumably glob2 expands `**` recursively -- confirm
imgs = glob('../copyright/data/ecco/images/originals/**/*.jpg')
# number of worker processes given to each multiprocessing Pool below
cores = 32
# Elasticsearch client created with the library's default settings
es = Elasticsearch()
# image signature store backed by `es`; index() and query() both use it
# NOTE(review): distance_cutoff=0.4 presumably caps the distance of
# returned matches -- confirm against the image_match docs
ses = SignatureES(es, timeout='120s', distance_cutoff=0.4)
def index(path):
    '''Add the image at `path` to the signature index.

    Best effort: any exception is printed and swallowed so that one
    unreadable image does not abort the whole run.

    Args:
      path: location of the image file to add.

    Returns:
      None, whether or not the image was added.
    '''
    try:
        ses.add_image(path)
    except Exception as exc:
        # report the cause as well, the same way query() does
        print(' ! error processing', path, exc)
def query(path):
    '''Look up the image at `path` in the signature index.

    The search is run with all_orientations=True.

    Returns:
      [path, matches] when the search succeeds; None when it raises,
      after the error has been printed.
    '''
    try:
        matches = ses.search_image(path, all_orientations=True)
    except Exception as exc:
        print(' ! error querying for', path, exc)
        return None
    return [path, matches]
# pass 1: add every image to the index in parallel; index() reports its
# own failures, so the results are only counted here
pool = Pool(cores)
for idx, _ in enumerate(pool.imap(index, imgs), 1):
    print(' * processed', idx, 'images')
pool.close()
pool.join()

# pass 2: query the index for every image and collect the matches
pool = Pool(cores)
results = {}
for idx, result in enumerate(pool.imap(query, imgs), 1):
    print(' * ran', idx, 'queries')
    # query() returns None when the search failed (it has already
    # printed the error); skip it instead of failing on the unpack
    if result is None:
        continue
    path, matches = result
    results[path] = matches
# shut the second pool down the same way as the first
pool.close()
pool.join()

# map each successfully queried image path to its list of matches
with open('results.json', 'w') as out:
    json.dump(results, out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment