Spark Elastic Search List
#!/usr/bin/env python
from pyspark import SparkContext, SparkConf
import urllib2
import json

conf = SparkConf().setAppName("ES Search List")
sc = SparkContext(conf=conf)

# Elasticsearch connection settings
url = 'localhost:9200'
index_name = 'index'
doc_type = 'doc_type'
def search(item):
    # in this example, just search one field and return the hit count
    query = {
        "from": 0,
        "size": 0,
        "query": {
            "match": {
                "some_field": item
            }
        }
    }
    query = json.dumps(query)
    # POST the query to the index/type _search endpoint
    response = urllib2.urlopen(
        'http://{}/{}/{}/_search'.format(url, index_name, doc_type), query)
    result = json.loads(response.read())
    total = result['hits']['total']
    return (item, total)
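For reference, the function only reads hits.total from the response. On the Elasticsearch 1.x API this gist targets, a _search response with size 0 is shaped roughly like the sketch below (the field values are illustrative, not real output):

{
  "took": 3,
  "timed_out": false,
  "hits": {
    "total": 42,
    "hits": []
  }
}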
# search terms, one term per line
items = [s.strip().lower() for s in open('input_list.txt').read().splitlines()]

# distribute the terms, run a search for each, and collect the
# (term, hit count) pairs locally (or use saveAsTextFile, sketched below)
results = sc.parallelize(items).map(search).collect()
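If the result set is too large to collect on the driver, the same pipeline can write directly from the executors instead, as the comment above suggests. A minimal sketch, assuming an output directory name of "spark_hits" (the name is an assumption; Spark creates it as a directory of part files):

# alternative to collect(): format each (term, count) pair and let
# the executors write the output as a directory of part files
sc.parallelize(items) \
    .map(search) \
    .map(lambda pair: "{}\t{}".format(pair[0], pair[1])) \
    .saveAsTextFile("spark_hits")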
# write the collected (term, hit count) pairs as tab-separated lines
with open("spark_hits.txt", 'w') as fout:
    for item, count in results:
        fout.write("{}\t{}\n".format(item, count))
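A minimal sketch of running the job, assuming the script is saved as spark_es_search.py (the filename is an assumption) and spark-submit is on the PATH:

# run locally using all available cores
spark-submit --master "local[*]" spark_es_search.py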