# alicefuzier/get_miro_ids.py

## get_miro_ids.py
import json
from elasticsearch import Elasticsearch

# Client for the Elasticsearch cluster.
# NOTE(review): credentials are embedded in the URL; these look like
# placeholders -- confirm no real secret is committed here.
es = Elasticsearch(['https://user:password@host:443/'])

# Output file that the extracted ids are appended to.
file = "v2-miro-ids.txt"

# Filter clauses: every one of them has to match for a work to be returned.
work_filters = [
    {"term": {"type": "IdentifiedWork"}},
    {"terms": {"items.agent.locations.locationType.id": ["iiif-image"]}},
    {"terms": {"workType.id": ["q", "k"]}},
]

# Start a scrolling search and fetch the first page of results.
# scroll='2m' asks the server to keep the scroll context alive for two
# minutes between requests; size=1000 is the page size.
page = es.search(
    index='indexname',
    doc_type='work',
    scroll='2m',
    size=1000,
    body={
        "query": {
            "bool": {
                "must": [{}],
                "filter": work_filters,
            },
        },
    },
)

# Scroll cursor for the next request, and the hit count reported by the
# first response (used to decide whether scrolling is needed at all).
sid = page['_scroll_id']
scroll_size = page['hits']['total']

def extract_miro_id(identifiers):
    """Return the Miro image number found among *identifiers*.

    Args:
        identifiers: iterable of identifier dicts, each shaped like
            ``{'identifierType': {'id': ...}, 'value': ...}``.

    Returns:
        The ``value`` of the identifier whose type id is
        ``'miro-image-number'``. If several identifiers match, the last
        one wins.

    Raises:
        Exception: if no identifier of that type carries a non-empty value.
    """
    # Start from None so that a missing identifier reaches the explicit check
    # below instead of failing with UnboundLocalError on an unassigned local.
    miro_id = None
    for identifier in identifiers:
        if identifier['identifierType']['id'] == 'miro-image-number':
            miro_id = identifier['value']
    if not miro_id:
        raise Exception("there is no miro id!")
    return miro_id

def get_miro_ids(page):
    """Collect one Miro id per hit of a search/scroll response page.

    Args:
        page: response dict whose hits live under ``page['hits']['hits']``.
            Each hit's ``_source`` provides ``sourceIdentifier`` (a single
            identifier) and ``otherIdentifiers`` (a list of identifiers).

    Returns:
        A list with the result of ``extract_miro_id`` for every hit, in
        response order. Empty when the page has no hits.
    """
    miro_ids = []
    for hit in page['hits']['hits']:
        source = hit['_source']
        # The source identifier is considered together with the others.
        candidates = [source['sourceIdentifier']] + source['otherIdentifiers']
        miro_ids.append(extract_miro_id(candidates))
    return miro_ids

# Append the ids of every page to the output file, one JSON array per line.
with open(file, "a") as out:
    while True:
        # Write the page currently held (the first iteration handles the
        # page returned by the initial search).
        out.write(json.dumps(get_miro_ids(page)) + "\n")
        # Stop once the last page written was empty; that final empty page
        # has already been written as "[]".
        if not scroll_size > 0:
            break
        print("Scrolling...")
        page = es.scroll(scroll_id=sid, scroll='2m')
        # Carry the scroll id from the latest response into the next request.
        sid = page['_scroll_id']
        scroll_size = len(page['hits']['hits'])
print("Done!")
	import json
	from elasticsearch import Elasticsearch

	# Client for the Elasticsearch cluster.
	# NOTE(review): credentials are embedded in the URL; these look like
	# placeholders -- confirm no real secret is committed here.
	es = Elasticsearch(['https://user:password@host:443/'])

	# Output file that the extracted ids are appended to.
	file = "v2-miro-ids.txt"

	# Filter clauses: every one of them has to match for a work to be returned.
	work_filters = [
		{"term": {"type": "IdentifiedWork"}},
		{"terms": {"items.agent.locations.locationType.id": ["iiif-image"]}},
		{"terms": {"workType.id": ["q", "k"]}},
	]

	# Start a scrolling search and fetch the first page of results.
	# scroll='2m' asks the server to keep the scroll context alive for two
	# minutes between requests; size=1000 is the page size.
	page = es.search(
		index='indexname',
		doc_type='work',
		scroll='2m',
		size=1000,
		body={
			"query": {
				"bool": {
					"must": [{}],
					"filter": work_filters,
				},
			},
		},
	)

	# Scroll cursor for the next request, and the hit count reported by the
	# first response (used to decide whether scrolling is needed at all).
	sid = page['_scroll_id']
	scroll_size = page['hits']['total']

	def extract_miro_id(identifiers):
		"""Return the Miro image number found among *identifiers*.

		Args:
			identifiers: iterable of identifier dicts, each shaped like
				``{'identifierType': {'id': ...}, 'value': ...}``.

		Returns:
			The ``value`` of the identifier whose type id is
			``'miro-image-number'``. If several identifiers match, the
			last one wins.

		Raises:
			Exception: if no identifier of that type carries a non-empty
				value.
		"""
		# Start from None so that a missing identifier reaches the explicit
		# check below instead of failing with UnboundLocalError.
		miro_id = None
		for identifier in identifiers:
			if identifier['identifierType']['id'] == 'miro-image-number':
				miro_id = identifier['value']
		if not miro_id:
			raise Exception("there is no miro id!")
		return miro_id

	def get_miro_ids(page):
		"""Collect one Miro id per hit of a search/scroll response page.

		Args:
			page: response dict whose hits live under
				``page['hits']['hits']``. Each hit's ``_source`` provides
				``sourceIdentifier`` (a single identifier) and
				``otherIdentifiers`` (a list of identifiers).

		Returns:
			A list with the result of ``extract_miro_id`` for every hit, in
			response order. Empty when the page has no hits.
		"""
		miro_ids = []
		for hit in page['hits']['hits']:
			source = hit['_source']
			# The source identifier is considered together with the others.
			candidates = [source['sourceIdentifier']] + source['otherIdentifiers']
			miro_ids.append(extract_miro_id(candidates))
		return miro_ids

	# Append the ids of every page to the output file, one JSON array per
	# line. Nesting restored: the writes and the scroll loop run while the
	# file is open; the final message is printed after it has been closed.
	with open(file, "a") as f:
		# First page comes from the initial search.
		miro_ids = get_miro_ids(page)
		f.write(json.dumps(miro_ids) + "\n")
		# Keep scrolling until a page comes back without hits; that final
		# empty page is still written, as "[]".
		while scroll_size > 0:
			print("Scrolling...")
			page = es.scroll(scroll_id=sid, scroll='2m')
			# Carry the scroll id from the latest response forward.
			sid = page['_scroll_id']
			scroll_size = len(page['hits']['hits'])
			miro_ids = get_miro_ids(page)
			f.write(json.dumps(miro_ids) + "\n")
	print("Done!")