Skip to content

Instantly share code, notes, and snippets.

@ravenscroftj
Created November 21, 2015 09:31
Show Gist options
  • Save ravenscroftj/64620a1a65a8d0098941 to your computer and use it in GitHub Desktop.
Save ravenscroftj/64620a1a65a8d0098941 to your computer and use it in GitHub Desktop.
from __future__ import print_function
import elasticsearch
from itertools import chain
# Scroll pointer timeout as an Elasticsearch time value (e.g. "1m" = 1 minute).
# One minute is usually fine, but you can increase it if you get timeout errors.
TIMEOUT = "1m"
def scrollr(client, scroll_id, f, scroll="10m"):
    """Lazily yield ``f(hit)`` for every hit reachable from a scroll cursor.

    Pages are fetched one at a time with ``client.scroll`` until a page with
    no hits is returned, at which point the generator finishes.

    Args:
        client: Object exposing ``scroll(scroll_id, scroll=...)`` that returns
            a mapping with a ``['hits']['hits']`` list (and, optionally, a
            ``'_scroll_id'`` entry).
        scroll_id: Scroll id used for the first ``client.scroll`` call.
        f: Callable applied to each hit; its return value is what is yielded.
        scroll: Keep-alive value forwarded to every ``client.scroll`` call.
            Defaults to ``"10m"``, the value previously hard-coded here.

    Yields:
        ``f(hit)`` for each hit, in the order the pages return them.
    """
    page = client.scroll(scroll_id, scroll=scroll)
    hits = page['hits']['hits']
    while hits:
        for item in hits:
            yield f(item)
        # Each response may carry a new scroll id; always continue with the
        # most recently received one, falling back to the current id when the
        # response does not include one.
        scroll_id = page.get('_scroll_id', scroll_id)
        page = client.scroll(scroll_id, scroll=scroll)
        hits = page['hits']['hits']
def extract_references(article):
    """Return the ``References`` field of a search hit.

    As a side effect, the keys of the hit's ``_source`` mapping are printed
    to standard output before the value is returned.

    Args:
        article: Mapping with a ``'_source'`` entry that itself is a mapping
            containing a ``'References'`` key.

    Returns:
        The value stored under ``article['_source']['References']``.

    Raises:
        KeyError: If ``'_source'`` or ``'References'`` is missing.
    """
    source = article['_source']
    # Diagnostic output: shows which fields this document actually carries.
    print(source.keys())
    return source['References']
if __name__ == "__main__":
    # Connect using the client's default settings.
    es = elasticsearch.Elasticsearch()

    # Open a scroll cursor over every "study" document in "impact_studies".
    # NOTE(review): with search_type="scan" the initial response is expected
    # to carry only the scroll id and no hits, which is why iteration starts
    # from the first scroll call inside scrollr() - confirm against the
    # Elasticsearch version in use.
    r = es.search("impact_studies", "study", q="*:*", search_type="scan", scroll="10m")
    scroll_id = r['_scroll_id']

    # Stream the references of each document to standard output.
    for ref in scrollr(es, scroll_id, extract_references):
        print(ref)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment