# duhaime/cluster_found_pairs.py

## cluster_found_pairs.py
import json
import os
from itertools import combinations
from pymongo import MongoClient
from collections import deque

def group_lists(list_of_lists):
  '''Merge every sublist that shares an element with another, transitively.

  Each sublist is treated as a set of connected items. The result holds one
  list per connected component, so no two returned lists share an item.

  Args:
    list_of_lists: iterable of iterables of hashable items. Empty
      sublists are ignored.

  Returns:
    A list of lists; each distinct input item appears in exactly one of
    them, exactly once.
  '''
  # union-find forest: every item maps to its parent, roots map to themselves.
  # A single pass that only merges neighbours of a sorted queue is not enough:
  # [1, 10], [2, 3], [3, 10] must end up as one group.
  parent = {}

  def find(item):
    # walk up to the root of the tree that holds item
    root = item
    while parent[root] != root:
      root = parent[root]
    # path compression: point everything visited straight at the root
    while parent[item] != root:
      parent[item], item = root, parent[item]
    return root

  for members in list_of_lists:
    members = list(members)
    if not members:
      continue
    for member in members:
      parent.setdefault(member, member)
    # join every member of this sublist to the tree of its first member
    anchor = find(members[0])
    for member in members[1:]:
      root = find(member)
      if root != anchor:
        parent[root] = anchor

  # collect the items of each tree under its root
  groups = {}
  for item in parent:
    groups.setdefault(find(item), []).append(item)
  return list(groups.values())


def get_image_id_to_type():
  '''Return a mapping from image id to image type.

  Queries the `ecco` collection of the `eea` database through a default
  MongoClient() for records whose `images` field exists and is not an empty
  list. Each image's `image_id` (with '.TIF' removed) is mapped to its `type`.

  Returns:
    dict mapping image id -> image type.
  '''
  print(' * fetching metadata')
  client = MongoClient()
  try:
    # only records that carry images, and only the images field of each
    query = {'images': {'$exists': True, '$ne': []}}
    projection = {'images': 1, '_id': 0}
    image_id_to_type = {}
    # iterate the cursor directly instead of copying every record into a list
    for record in client.eea.ecco.find(query, projection):
      for image in record['images']:
        image_id_to_type[image['image_id'].replace('.TIF', '')] = image['type']
    return image_id_to_type
  finally:
    # release the connection once the lookup table is built (or on failure)
    client.close()


def get_match_groups(results_path='results.json', image_types=None, max_dist=0.4):
  '''Return a list of lists where each sublist holds the paths of images
  that are sufficiently similar to one query image.

  Args:
    results_path: JSON file mapping each query image path to a list of
      matches, each a dict with at least `path` and `dist` keys.
    image_types: mapping from image id (file basename without '.jpg') to
      image type. Defaults to the module-level `image_id_to_type`.
    max_dist: exclusive upper bound on `dist` for a match to be kept.

  Returns:
    A list of lists of image paths. Each sublist contains one query image
    and its kept matches; query images with no kept match are omitted.

  Raises:
    KeyError: if an image id is missing from `image_types`.
  '''
  if image_types is None:
    # fall back to the lookup table built at module level
    image_types = image_id_to_type
  # read the results up front so the file handle is closed straight away
  with open(results_path) as handle:
    results = json.load(handle)
  total = len(results)

  match_groups = []
  for idx, image_path in enumerate(results):
    print(' * processed', idx + 1, 'of', total, 'images')
    image_id = os.path.basename(image_path).replace('.jpg', '')
    if image_types[image_id] == 'music':
      continue
    # the first 10 characters of an image id identify its record
    record_id = image_id[:10]
    match_set = set()
    for match in results[image_path]:
      match_path = match['path']
      match_image_id = os.path.basename(match_path).replace('.jpg', '')
      if image_types[match_image_id] == 'music':
        continue
      # distance of 0 = identical file; matches within the same record are skipped
      if 0 < match['dist'] < max_dist and match_image_id[:10] != record_id:
        match_set.add(match_path)
        match_set.add(image_path)
    if match_set:
      match_groups.append(list(match_set))
  return match_groups


# the id -> type lookup must exist first: get_match_groups reads it as a global
image_id_to_type = get_image_id_to_type()
match_groups = get_match_groups()
clustered = group_lists(match_groups)

# persist the clustered groups as JSON
with open('groups.json', 'w') as out:
  out.write(json.dumps(list(clustered)))

## find_image_reprints.py
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES
from multiprocessing import Pool
from glob2 import glob
import json

# number of worker processes handed to each Pool below
cores = 32
# image paths matched by the recursive .jpg pattern under originals/
imgs = glob('../copyright/data/ecco/images/originals/**/*.jpg')
# signature store built on a default Elasticsearch() client
es = Elasticsearch()
ses = SignatureES(es, timeout='120s', distance_cutoff=0.4)

def index(path):
  '''Add the image at `path` to the signature index.

  Any failure is reported on stdout, together with its cause, and then
  ignored so that the caller can carry on with the next image.

  Args:
    path: image path handed to ses.add_image.

  Returns:
    None.
  '''
  try:
    ses.add_image(path)
  except Exception as exc:
    # best-effort: report the cause, as query() does, instead of dropping it
    print(' ! error processing', path, exc)

def query(path):
  '''Search the signature index for images matching the one at `path`.

  Returns a two-item list [path, matches] on success. On any error the
  problem is printed and None is returned.
  '''
  try:
    matches = ses.search_image(path, all_orientations=True)
  except Exception as exc:
    print(' ! error querying for', path, exc)
    return None
  return [path, matches]

# pass 1: add every image to the index
pool = Pool(cores)
for idx, r in enumerate(pool.imap(index, imgs)):
  print(' * processed', idx+1, 'images')
pool.close()
pool.join()

# pass 2: query the index with every image and collect the matches
pool = Pool(cores)
results = {}
for idx, r in enumerate(pool.imap(query, imgs)):
  print(' * ran', idx+1, 'queries')
  # query() returns None when the search failed; there is nothing to record
  if r is None:
    continue
  path, matches = r
  results[path] = matches
# shut the second pool down just like the first one
pool.close()
pool.join()

with open('results.json', 'w') as out:
  json.dump(results, out)
	import json
	import os
	from itertools import combinations
	from pymongo import MongoClient
	from collections import deque

	def group_lists(list_of_lists):
		'''Merge every sublist that shares an element with another, transitively.

		Each sublist is treated as a set of connected items. The result holds one
		list per connected component, so no two returned lists share an item.

		Args:
			list_of_lists: iterable of iterables of hashable items. Empty
				sublists are ignored.

		Returns:
			A list of lists; each distinct input item appears in exactly one of
			them, exactly once.
		'''
		# union-find forest: every item maps to its parent, roots map to themselves.
		# A single pass that only merges neighbours of a sorted queue is not enough:
		# [1, 10], [2, 3], [3, 10] must end up as one group.
		parent = {}

		def find(item):
			# walk up to the root of the tree that holds item
			root = item
			while parent[root] != root:
				root = parent[root]
			# path compression: point everything visited straight at the root
			while parent[item] != root:
				parent[item], item = root, parent[item]
			return root

		for members in list_of_lists:
			members = list(members)
			if not members:
				continue
			for member in members:
				parent.setdefault(member, member)
			# join every member of this sublist to the tree of its first member
			anchor = find(members[0])
			for member in members[1:]:
				root = find(member)
				if root != anchor:
					parent[root] = anchor

		# collect the items of each tree under its root
		groups = {}
		for item in parent:
			groups.setdefault(find(item), []).append(item)
		return list(groups.values())


	def get_image_id_to_type():
		'''Return a mapping from image id to image type.

		Queries the `ecco` collection of the `eea` database through a default
		MongoClient() for records whose `images` field exists and is not an empty
		list. Each image's `image_id` (with '.TIF' removed) is mapped to its `type`.

		Returns:
			dict mapping image id -> image type.
		'''
		print(' * fetching metadata')
		client = MongoClient()
		try:
			# only records that carry images, and only the images field of each
			query = {'images': {'$exists': True, '$ne': []}}
			projection = {'images': 1, '_id': 0}
			image_id_to_type = {}
			# iterate the cursor directly instead of copying every record into a list
			for record in client.eea.ecco.find(query, projection):
				for image in record['images']:
					image_id_to_type[image['image_id'].replace('.TIF', '')] = image['type']
			return image_id_to_type
		finally:
			# release the connection once the lookup table is built (or on failure)
			client.close()


	def get_match_groups(results_path='results.json', image_types=None, max_dist=0.4):
		'''Return a list of lists where each sublist holds the paths of images
		that are sufficiently similar to one query image.

		Args:
			results_path: JSON file mapping each query image path to a list of
				matches, each a dict with at least `path` and `dist` keys.
			image_types: mapping from image id (file basename without '.jpg') to
				image type. Defaults to the module-level `image_id_to_type`.
			max_dist: exclusive upper bound on `dist` for a match to be kept.

		Returns:
			A list of lists of image paths. Each sublist contains one query image
			and its kept matches; query images with no kept match are omitted.

		Raises:
			KeyError: if an image id is missing from `image_types`.
		'''
		if image_types is None:
			# fall back to the lookup table built at module level
			image_types = image_id_to_type
		# read the results up front so the file handle is closed straight away
		with open(results_path) as handle:
			results = json.load(handle)
		total = len(results)

		match_groups = []
		for idx, image_path in enumerate(results):
			print(' * processed', idx + 1, 'of', total, 'images')
			image_id = os.path.basename(image_path).replace('.jpg', '')
			if image_types[image_id] == 'music':
				continue
			# the first 10 characters of an image id identify its record
			record_id = image_id[:10]
			match_set = set()
			for match in results[image_path]:
				match_path = match['path']
				match_image_id = os.path.basename(match_path).replace('.jpg', '')
				if image_types[match_image_id] == 'music':
					continue
				# distance of 0 = identical file; matches within the same record are skipped
				if 0 < match['dist'] < max_dist and match_image_id[:10] != record_id:
					match_set.add(match_path)
					match_set.add(image_path)
			if match_set:
				match_groups.append(list(match_set))
		return match_groups


	# the id -> type lookup must exist first: get_match_groups reads it as a global
	image_id_to_type = get_image_id_to_type()
	match_groups = get_match_groups()
	clustered = group_lists(match_groups)

	# persist the clustered groups as JSON
	with open('groups.json', 'w') as out:
		json.dump(list(clustered), out)
	from elasticsearch import Elasticsearch
	from image_match.elasticsearch_driver import SignatureES
	from multiprocessing import Pool
	from glob2 import glob
	import json

	# image paths matched by the recursive .jpg pattern under originals/
	# (the '**/*.jpg' pattern had been garbled to '*/.jpg' in this copy)
	imgs = glob('../copyright/data/ecco/images/originals/**/*.jpg')
	# number of worker processes handed to each Pool below
	cores = 32
	# signature store built on a default Elasticsearch() client
	es = Elasticsearch()
	ses = SignatureES(es, timeout='120s', distance_cutoff=0.4)

	def index(path):
		'''Add the image at `path` to the signature index.

		Any failure is reported on stdout, together with its cause, and then
		ignored so that the caller can carry on with the next image.

		Args:
			path: image path handed to ses.add_image.

		Returns:
			None.
		'''
		try:
			ses.add_image(path)
		except Exception as exc:
			# best-effort: report the cause, as query() does, instead of dropping it
			print(' ! error processing', path, exc)

	def query(path):
		'''Search the signature index for images matching the one at `path`.

		Returns a two-item list [path, matches] on success. On any error the
		problem is printed and None is returned.
		'''
		try:
			return [path, ses.search_image(path, all_orientations=True)]
		except Exception as exc:
			print(' ! error querying for', path, exc)
			return None

	# pass 1: add every image to the index
	pool = Pool(cores)
	for idx, r in enumerate(pool.imap(index, imgs)):
		print(' * processed', idx+1, 'images')
	pool.close()
	pool.join()

	# pass 2: query the index with every image and collect the matches
	pool = Pool(cores)
	results = {}
	for idx, r in enumerate(pool.imap(query, imgs)):
		print(' * ran', idx+1, 'queries')
		# query() returns None when the search failed; there is nothing to record
		if r is None:
			continue
		path, matches = r
		results[path] = matches
	# shut the second pool down just like the first one
	pool.close()
	pool.join()

	with open('results.json', 'w') as out:
		json.dump(results, out)