Created
May 23, 2019 15:46
-
-
Save ns-mkusper/cd315c122faeecb4c95629e0152a0fd2 to your computer and use it in GitHub Desktop.
Minimal working example of retrieving a large Elasticsearch query result set in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch, RequestsHttpConnection | |
from requests_aws4auth import AWS4Auth | |
import elasticsearch.helpers | |
import boto3 | |
import datetime | |
import json | |
# Connection settings -- placeholders to fill in before running the script.
host = ''       # hostname the Elasticsearch client connects to (port 443)
region = ''     # AWS region handed to the request signer
service = 'es'  # AWS service name handed to the request signer
query = ''      # optional query for helpers.scan; when empty, no query is sent
domain = ''     # not referenced in the visible part of this script
index = ''      # index name handed to helpers.scan

# Look up AWS credentials through boto3's default session.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # Fail with a clear message instead of an AttributeError on
    # `credentials.access_key` below when no credentials are configured.
    raise RuntimeError(
        "No AWS credentials found; configure credentials for boto3 "
        "before running this script.")

# Request signer used as the HTTP auth handler of the client below.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token)

# HTTPS client with certificate verification; RequestsHttpConnection is the
# connection class that receives the AWS4Auth object through `http_auth`.
es = Elasticsearch(
    hosts=[{
        'host': host,
        'port': 443
    }],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection)
def run_scan(scan):
    """Return the ``_source`` document of every hit produced by *scan*.

    Args:
        scan: Iterable of hit dicts, each carrying a ``'_source'`` key.
            This script passes the generator returned by
            ``elasticsearch.helpers.scan``.

    Returns:
        list: One entry per hit, in iteration order. Each entry is the
        hit's ``_source`` after a JSON dump/load round trip.

    Raises:
        KeyError: If a hit has no ``'_source'`` key.
        TypeError: If a ``_source`` value is not JSON serializable.
    """
    def scan_runner():
        # Every element of the iterable is treated as a hit: helpers.scan
        # yields hit dicts directly, so nothing is skipped up front and an
        # empty result set simply produces an empty list.
        for count, item in enumerate(scan, start=1):
            if count % 1000 == 0:
                # str() is required: concatenating an int to a str raises
                # TypeError.
                print("Items read: " + str(count))
            # The JSON round trip yields an independent copy built only
            # from plain JSON types.
            yield json.loads(json.dumps(item['_source']))

    return list(scan_runner())
# Keyword arguments forwarded to elasticsearch.helpers.scan.
scan_args = dict(client=es, index=index, preserve_order=True)
if query:
    # Send a query only when one was configured above.
    scan_args.update(query=query)

# Build the scan generator, drain it into a list, and dump the result.
scanner = elasticsearch.helpers.scan(**scan_args)
scan_results = run_scan(scanner)
print(scan_results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment