Skip to content

Instantly share code, notes, and snippets.

@cjmcgraw
Created October 20, 2017 20:08
Show Gist options
  • Save cjmcgraw/bc2f8842367adc6675e731647812225a to your computer and use it in GitHub Desktop.
Save cjmcgraw/bc2f8842367adc6675e731647812225a to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
import subprocess
import argparse
import httplib
import json
# Command-line interface: which index to dump and where the cluster lives.
# NOTE: parse_args() runs at import time, so this script cannot be imported
# as a library without consuming sys.argv.
parser = argparse.ArgumentParser(description="print out all documents in an index to stdout")
parser.add_argument("--index", type=str, help="name of the Elasticsearch index to dump")
parser.add_argument("--host", type=str, help="hostname of the Elasticsearch cluster")
parser.add_argument("--port", default=9200, type=int, help="Elasticsearch HTTP port (default: 9200)")
args = parser.parse_args()
def get_all_documents(index, host=args.host, port=args.port):
    """Yield every document's ``_source`` (serialized as a JSON string)
    from an Elasticsearch index, paging through it with the scroll API
    in batches of 1000.

    index -- name of the index to dump
    host/port -- cluster location (defaults captured from CLI args at def time)

    Raises ValueError when a scroll response is missing the expected
    ``_scroll_id`` or ``hits.hits`` fields.
    """
    domain = '{}:{}'.format(host, port)

    def parse_output(output):
        # Parse one scroll response into (scroll_id, list-of-_source-docs).
        obj = json.loads(output)
        if '_scroll_id' not in obj:
            raise ValueError("missing _scroll_id in top level json..")
        # BUG FIX: the original used `and`, which could never be true when
        # 'hits' was absent -- it raised KeyError on obj['hits'] instead of
        # the intended ValueError.
        if "hits" not in obj or "hits" not in obj["hits"]:
            raise ValueError("missing hits.hits in top level json..")
        # A real list (not a lazy map object) so len() works on Python 3 too.
        return obj['_scroll_id'], [hit['_source'] for hit in obj['hits']['hits']]

    # Argument-list form (shell=False) so host/index values cannot be used
    # for shell injection; the curl invocation is otherwise identical.
    start_output = subprocess.check_output([
        "curl", "--silent", "-XPOST",
        "{}/{}/_search?scroll=10m&size=1000".format(domain, index),
    ])
    scroll_id, documents = parse_output(start_output)
    while documents and scroll_id:
        for doc in documents:
            yield json.dumps(doc)
        # Continue the scroll; the server returns the next page plus a
        # (possibly refreshed) scroll id.
        output = subprocess.check_output([
            "curl", "--silent", "-XPOST",
            "{}/_search/scroll".format(domain),
            "--data", json.dumps({"scroll": "10m", "scroll_id": scroll_id}),
        ])
        scroll_id, documents = parse_output(output)
# Stream every document of the requested index to stdout, one JSON object
# per line.
for doc in get_all_documents(args.index):
    print(doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment