Skip to content

Instantly share code, notes, and snippets.

@whs2k
Last active August 13, 2018 15:55
Show Gist options
  • Save whs2k/37ccc8b00c14db6b14de48541f9ff7bd to your computer and use it in GitHub Desktop.
Save whs2k/37ccc8b00c14db6b14de48541f9ff7bd to your computer and use it in GitHub Desktop.
Common Commands for Elasticsearch
###
# 1. Load an Elasticsearch index into a PySpark RDD
###
# Settings passed as the Hadoop job configuration for the read below.
# Every value is a string; the host names, resource and credentials are
# placeholders to be replaced with real values.
es_read_conf = {
    "es.nodes": "node-1.XXX.YYY,node-2.XXX.YYY,node-3.XXX.YYY,node-4.XXX.YYY",
    "es.port": "9200",
    "es.resource": "INDEX/TYPE",
    "es.net.http.auth.user": "USErNAME",
    "es.net.http.auth.pass": "PASSWORD",
    "es.nodes.wan.only": "false",
    "es.net.ssl": "true",
    "es.batch.size.entries": "5000"
    # Optional, disabled: "es.query" : q
    # (`q` is not defined in this snippet; a comma must be added after the
    # previous entry if this is enabled.)
}

# NOTE(review): `sc` is presumably an already-running SparkContext, e.g. the
# one the pyspark shell provides -- it is not created in this snippet.
es_rdd = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",      # inputFormatClass
    "org.apache.hadoop.io.NullWritable",              # keyClass
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",  # valueClass
    conf=es_read_conf,
)
###
# 2. Read an index from Elasticsearch with the Python client
# Assumes every document in the index has a text field at ['_source']['text'].
###
from elasticsearch import Elasticsearch
# Import the helpers submodule explicitly: importing only the Elasticsearch
# class does not bind the name `elasticsearch`, so the original
# `elasticsearch.helpers.scan(...)` call failed with a NameError.
from elasticsearch import helpers

# Alternative: pass a single "host:port" string instead of a host dict.
# es = Elasticsearch("HOSTNAME.XXX.YYY:PORT")
ES_HOST = {
    "host": "HOSTNAME.XXX.YYY",
    "port": 9201,
}
ES_INDEX = "ZZZZ"
ES_TYPE = "WWWW"

es = Elasticsearch(hosts=[ES_HOST])

# helpers.scan returns an iterator over every hit matching the query, so the
# whole index can be walked in a plain for-loop.
results_gen = helpers.scan(
    es,
    query={"query": {"match_all": {}}},
    index=ES_INDEX,
    doc_type=ES_TYPE,
)
for record in results_gen:
    # Do something with each hit, such as pulling out its text field.
    text = record['_source']['text']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment