
ElasticSearch Downloader

A very basic script for downloading a specific index from ElasticSearch into a file containing one JSON-formatted document per line.
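
Each line of the output holds the complete hit object as returned by ES, with the actual document under _source. An illustrative (made-up) example line:

{"_index": "my_index", "_type": "doc", "_id": "a1b2c3", "_score": 1.0, "_source": {"title": "some document"}}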

The argparse options should be pretty self-explanatory.

Call the script like this:

python download.py --url=http://my.elastic.search.eu --port=9200 --index=my_index --out=/path/to/output/

The result will be stored in /path/to/output/my_index.json. Omitting the port argument leaves the port out of the URL entirely (useful if your ES is behind nginx; other tools tend to add a default port on their own).
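
For example, if the cluster sits behind a reverse proxy on the standard HTTP port, the call might simply drop --port (the URL is the same placeholder as above):

python download.py --url=http://my.elastic.search.eu --index=my_index --out=/path/to/output/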

import json
import requests
import os
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', help='URL to ES server', required=True)
    parser.add_argument('--port', type=int, help='port for ES server')
    parser.add_argument('--index', help='index to back up', required=True)
    parser.add_argument('--batch-size', default=100, type=int, help='num of docs to fetch in one request')
    parser.add_argument('--user', type=str, help='username for httpauth')
    parser.add_argument('--passwd', type=str, help='password for httpauth')
    parser.add_argument('--out', type=str, help='directory to backup to', required=True)
    args = parser.parse_args()
    print(args)

    baseurl = args.url
    auth = None
    if args.port:
        baseurl += ':' + str(args.port)
    if args.user and args.passwd:
        auth = (args.user, args.passwd)

    # init scroll; note that this first response already contains the first batch of hits
    r = requests.post(baseurl + '/' + args.index + '/_search?scroll=1m',
                      auth=auth, json={'size': args.batch_size})
    batch = r.json()
    scroll_id = batch['_scroll_id']

    out_file = os.path.abspath(os.path.join(args.out, args.index + '.json'))
    print('writing to ' + out_file)
    with open(out_file, 'w') as f_out:
        cnt = 0
        while True:
            batch_size = len(batch['hits']['hits'])
            if batch_size == 0:
                break
            cnt += batch_size
            print('Received {} docs of {} total | {} ({:.2f}%)'.format(
                batch_size, batch['hits']['total'], cnt,
                (cnt / batch['hits']['total']) * 100.0))
            # write this batch (including the initial one) before fetching the next
            for doc in batch['hits']['hits']:
                f_out.write(json.dumps(doc) + '\n')
            # fetch the next batch using the scroll cursor
            r = requests.post(baseurl + '/_search/scroll', auth=auth,
                              json={'scroll': '1m', 'scroll_id': scroll_id})
            batch = r.json()
            scroll_id = batch['_scroll_id']
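
Note: the script assumes the pre-7.x response format, where hits.total is a plain number; on Elasticsearch 7+ it is an object like {"value": 10000, "relation": "eq"}, so the progress line would need a small adjustment.

To work with the dump later, each line can be parsed independently. A minimal sketch for reading the file back and extracting the document bodies (the path matches the example call above):

import json

docs = []
with open('/path/to/output/my_index.json') as f_in:
    for line in f_in:
        hit = json.loads(line)       # one full ES hit object per line
        docs.append(hit['_source'])  # keep just the document body
print('loaded {} documents'.format(len(docs)))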