Last active
June 16, 2018 00:16
-
-
Save niceyeti/c8c9f64b27450d5a9c5d27233beb2c00 to your computer and use it in GitHub Desktop.
Python elasticsearch client failing scroll functionality
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\Users\BillyBob\ES_Queries>python scrollTest.py | |
DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAO-9cWR25VNkxzVjFTSVNubDRCdEoyTnFXQQ== | |
Traceback (most recent call last): | |
File "scrollTest.py", line 54, in main | |
getAllDocs2() | |
File "scrollTest.py", line 16, in getAllDocs2 | |
res2 = es.scroll(scroll_id = scroll, scroll = '1m') | |
File "C:\Python27\lib\site-packages\elasticsearch\client\utils.py", line 73, in _wrapped | |
return func(*args, params=params, **kwargs) | |
File "C:\Python27\lib\site-packages\elasticsearch\client\__init__.py", line 1033, in scroll | |
params=params, body=body) | |
File "C:\Python27\lib\site-packages\elasticsearch\transport.py", line 312, in perform_request | |
status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout) | |
File "C:\Python27\lib\site-packages\elasticsearch\connection\http_urllib3.py", line 129, in perform_request | |
self._raise_error(response.status, raw_data) | |
File "C:\Python27\lib\site-packages\elasticsearch\connection\base.py", line 125, in _raise_error | |
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info) | |
RequestError: TransportError(400, u'action_request_validation_exception', u'Validation Failed: 1: scrollId is missing;') | |
{u'_score': 1.0, u'_type': u'logs', u'_id': u'AV8LhQqZyn_BE1UV4cVV', u'_source': {u'host': u'**.**.**.**', u'@timestamp': u'2017-10-11T13:13:15.000Z', ... | |
{u'_score': 1.0, u'_type': u'logs', u'_id': u'AV8LhQqZyn_BE1UV4cVW', u'_source': {u'host': u'**.**.**.**', u'@timestamp': u'2017-10-11T13:13:15.000Z', ... | |
FAILED hitCount: 2 | |
Traceback (most recent call last): | |
File "scrollTest.py", line 42, in getAllDocs3 | |
for hit in s: | |
File "C:\Python27\lib\site-packages\elasticsearch\helpers\__init__.py", line 379, in scan | |
**scroll_kwargs) | |
File "C:\Python27\lib\site-packages\elasticsearch\client\utils.py", line 73, in _wrapped | |
return func(*args, params=params, **kwargs) | |
File "C:\Python27\lib\site-packages\elasticsearch\client\__init__.py", line 1033, in scroll | |
params=params, body=body) | |
File "C:\Python27\lib\site-packages\elasticsearch\transport.py", line 312, in perform_request | |
status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout) | |
File "C:\Python27\lib\site-packages\elasticsearch\connection\http_urllib3.py", line 129, in perform_request | |
self._raise_error(response.status, raw_data) | |
File "C:\Python27\lib\site-packages\elasticsearch\connection\base.py", line 125, in _raise_error | |
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info) | |
RequestError: TransportError(400, u'action_request_validation_exception', u'Validation Failed: 1: scrollId is missing;') | |
Total hits: 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import elasticsearch as elasticsearch | |
import elasticsearch.helpers | |
import traceback | |
""" | |
Testing the retrieval of all documents using _search/scroll via | |
python client wrapper elasticsearch.helpers.scan() and search()/scroll(). | |
Elastic server version: 5.6.4, Lucene 6.6.1 | |
Python elasticsearch module version: VERSION = (5, 5, 2) | |
""" | |
def getAllDocs2(): | |
""" | |
Trying to retrieve all ~20k documents using search() and scroll() | |
methods of elasticsearch.Elasticsearch. This fails with message | |
"Validation Failed: scrollId is missing;". | |
""" | |
es = elasticsearch.Elasticsearch(["http://*.*.*.*:80/elasticsearch/"]) | |
qDict = {'query': {'match_all' : {}}} | |
index = "netflow-v5-2017.10.11" | |
#res = es.search(index=index, doc_type='myType', body=doc,scroll='1m') | |
res = es.search(index=index, body=qDict,scroll='3m') | |
scroll = res['_scroll_id'] | |
print(scroll) | |
res2 = es.scroll(scroll_id = scroll, scroll = '1m') | |
print(res2['_scroll_id']) | |
def getAllDocs3(): | |
""" | |
Another attempt to retrieve all ~20k documents using the elasticsearch.helpers.scan | |
wrapper method. This also fails with "Validation Failed: scrollId is missing;". | |
""" | |
es = elasticsearch.Elasticsearch(["http://*.*.*.*:80/elasticsearch/"]) | |
hitCount = 0 | |
index = "netflow-v5-2017.10.11" | |
qDict = {'query': {'match': {}}} | |
s = elasticsearch.helpers.scan(es, | |
query=qDict, | |
index=index, | |
size=2, | |
preserve_order=True, | |
clear_scroll=False, | |
scroll='5m' | |
) | |
try: | |
for hit in s: | |
hitCount += 1 | |
print(hit) | |
print("SUCCESS hitCount: {}".format(hitCount)) | |
except: | |
print("FAILED hitCount: {}".format(hitCount)) | |
traceback.print_exc() | |
print("Total hits: {}".format(hitCount)) | |
def main(): | |
try: | |
getAllDocs2() | |
except: | |
traceback.print_exc() | |
try: | |
getAllDocs3() | |
except: | |
traceback.print_exc() | |
if __name__ == "__main__": | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See output.txt exception messages for what happens when getAllDocs2 and then getAllDocs3 are called from main. These are just attempts to dump all documents from an index, using the elasticsearch python clients, one using the helpers.scan() method and the other using search()+scroll(). The elastic version is 5.6.4, client is correct for elastic 5.x.x. I've tested other clients as well, and they all fail with a message about the 'scrollId' missing. However, using wireshark I can observe the clients sending back a b64-encoded scroll_id parameter to the correct '_search/scroll endpoint', but the the server returns a 400 bad request containing 'Validation Failed: 1: scrollId is missing;'. Using the kibana query testing console I am able to iterate documents using the '_search/scroll' endpoint correctly, so I at least know scrolling can succeed. But these python rest clients are failing , despite correctly sending back scroll_id to elastic after the first query.