Spark Elastic Search List
#!/usr/bin/env python
from pyspark import SparkContext, SparkConf
import urllib2
import json

conf = SparkConf().setAppName("ES Search List")
sc = SparkContext(conf=conf)

# Elasticsearch connection settings
url = 'localhost:9200'
index_name = 'index'
doc_type = 'doc_type'
def search(item):
    # in this example, just search one field and return the hit count
    query = {
        "from": 0,
        "size": 0,
        "query": {
            "match": {
                "some_field": item
            }
        }
    }
    query = json.dumps(query)
    # POST the query to the index/type _search endpoint
    response = urllib2.urlopen(
        'http://{}/{}/{}/_search'.format(url, index_name, doc_type), query)
    result = json.loads(response.read())
    total = result['hits']['total']
    return (item, total)
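For reference, the function only reads hits.total from the response. On the Elasticsearch 1.x API this gist targets, a _search response with size 0 is shaped roughly like the sketch below (the field values are illustrative, not real output):

{
  "took": 3,
  "timed_out": false,
  "hits": {
    "total": 42,
    "hits": []
  }
}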
# search terms, one term per line
items = [s.strip().lower() for s in open('input_list.txt').read().splitlines()]

# distribute the terms, run a search for each, and collect the
# (term, hit count) pairs locally (or use saveAsTextFile, sketched below)
results = sc.parallelize(items).map(search).collect()
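If the result set is too large to collect on the driver, the same pipeline can write directly from the executors instead, as the comment above suggests. A minimal sketch, assuming an output directory name of "spark_hits" (the name is an assumption; Spark creates it as a directory of part files):

# alternative to collect(): format each (term, count) pair and let
# the executors write the output as a directory of part files
sc.parallelize(items) \
    .map(search) \
    .map(lambda pair: "{}\t{}".format(pair[0], pair[1])) \
    .saveAsTextFile("spark_hits")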
# write the collected (term, hit count) pairs as tab-separated lines
with open("spark_hits.txt", 'w') as fout:
    for item, count in results:
        fout.write("{}\t{}\n".format(item, count))
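A minimal sketch of running the job, assuming the script is saved as spark_es_search.py (the filename is an assumption) and spark-submit is on the PATH:

# run locally using all available cores
spark-submit --master "local[*]" spark_es_search.py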