# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @alicefuzier
# Last active November 27, 2018 13:09
# Show Gist options
#   • Save alicefuzier/964ed4cd9257cb3058a28a197f310339 to your computer and use it in GitHub Desktop.
import json
from elasticsearch import Elasticsearch
# Output file; one JSON array of Miro ids is appended per result page.
file = "v2-miro-ids.txt"

# Client for the cluster (the URL carries what look like placeholder
# credentials and host -- replace before running).
es = Elasticsearch(['https://user:password@host:443/'])

# Initial scrolling search: requests pages of 1000 hits and asks for the
# scroll to be kept for '2m'. Only the filter clauses restrict the results.
page = es.search(
    index='indexname',
    doc_type='work',
    scroll='2m',
    size=1000,
    body={
        "query": {
            "bool": {
                "must": [
                    {},
                ],
                "filter": [
                    {"term": {"type": "IdentifiedWork"}},
                    {"terms": {"items.agent.locations.locationType.id": ["iiif-image"]}},
                    {"terms": {"workType.id": ["q", "k"]}},
                ],
            },
        },
    },
)

# Scroll cursor for the follow-up es.scroll() calls, and the reported total
# hit count, which the scroll loop uses as its initial "keep going" value.
sid, scroll_size = page['_scroll_id'], page['hits']['total']
def extract_miro_id(identifiers):
    """Return the Miro image number found among *identifiers*.

    Args:
        identifiers: iterable of identifier dicts; each one is read via
            ``identifier['identifierType']['id']`` and ``identifier['value']``.

    Returns:
        The ``value`` of the last identifier whose type id is
        ``'miro-image-number'``.

    Raises:
        Exception: if no identifier of that type is present, or its value
            is empty.
    """
    # Start from None so that "nothing matched" reaches the explicit check
    # below instead of failing on an unassigned local.
    miro_id = None
    for identifier in identifiers:
        if identifier['identifierType']['id'] == 'miro-image-number':
            miro_id = identifier['value']
    if not miro_id:
        raise Exception("there is no miro id!")
    return miro_id
def get_miro_ids(page):
    """Return the Miro id of every hit in *page*, in hit order.

    For each entry of ``page['hits']['hits']`` the hit's
    ``sourceIdentifier`` is placed in front of its ``otherIdentifiers`` and
    the combined list is handed to ``extract_miro_id``.
    """
    collected = []
    for hit in page['hits']['hits']:
        source = hit['_source']
        candidates = [source['sourceIdentifier']] + source['otherIdentifiers']
        collected.append(extract_miro_id(candidates))
    return collected
# Append the Miro ids of every result page to the output file, one JSON
# array per line, following the scroll until a page comes back empty.
with open(file, "a") as f:
    miro_ids = get_miro_ids(page)
    if miro_ids:
        f.write(json.dumps(miro_ids) + "\n")
    while scroll_size > 0:
        print("Scrolling...")
        page = es.scroll(scroll_id=sid, scroll='2m')
        # Continue from the scroll id returned by the latest response.
        sid = page['_scroll_id']
        # From here on the loop condition is the size of the page just fetched.
        scroll_size = len(page['hits']['hits'])
        miro_ids = get_miro_ids(page)
        # The scroll ends on a page with no hits; do not record it as an
        # empty "[]" line in the output.
        if miro_ids:
            f.write(json.dumps(miro_ids) + "\n")
print("Done!")
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment