meiqimichelle/indexer.py

## indexer.py
# ############## Michelle: Excel to JSON to ES ##############

from xlrd import open_workbook
from pyelasticsearch import ElasticSearch
es = ElasticSearch('http://ec2-99-999-99-99.us-gov-west-1.compute.amazonaws.com:9200') # This is an example location. Change to local or other hosted location to use.

#create index first so you can set the mappings before you index data
es.create_index("unidatareport")


#create mapping for type "dataunit"
mapping = {
    "dataunit" : {
        "properties" : {
            "Company" : {"type" : "string", "index" : "not_analyzed"},
            "RevenueType" : {"type" : "string", "index" : "not_analyzed"},
            "Commodity" : {"type" : "string", "index" : "not_analyzed"},
            "Revenue" : {"type" : "integer"}
        }
    }
}
es.put_mapping("unidatareport", "dataunit", mapping)

# attempt to make this multifield, was not useful
# mapping = {
#     "dataunit" : {
#         "properties" : {
#             "Company" : {
#                 "type" : "multi_field",
#                 "fields" : {
#                     "Company" : {"type" : "string", "index" : "analyzed"},
#                     "raw" : { "type" : "string", "index" : "not_analyzed"}
#                 }
#             },
#             "RevenueType" : {"type" : "string"},
#             "Commodity" : {"type" : "string", "index" : "not_analyzed"},
#             "Revenue" : {"type" : "integer"}
#         }
#     }
# }
# es.put_mapping("unidatareport", "dataunit", mapping)


#iterate through file
def UniData(filename):

    #open file
    wb = open_workbook(filename, on_demand=True)
    sheet = wb.sheet_by_index(0)

    count = 0
    for s in wb.sheets():
        for row in range(s.nrows):
            data = {}
            count += 1
            for col in range(s.ncols):
                if col == 0:
                    key = 'Company'
                elif col == 1:
                    key = 'RevenueType'
                elif col == 2:
                    key = 'Commodity'
                elif col == 3:
                    key = 'Revenue'
                # else:
                #     print 'Unexpected number of columns'

                value = s.cell(row,col).value
                data[key] = value
            es.index("unidatareport", "dataunit", data, id=count)


UniData('uni_dummySet_apr17.xlsx')
	# ############## Michelle: Excel to JSON to ES ##############

	from xlrd import open_workbook
	from pyelasticsearch import ElasticSearch
	es = ElasticSearch('http://ec2-99-999-99-99.us-gov-west-1.compute.amazonaws.com:9200') # This is an example location. Change to local or other hosted location to use.

	#create index first so you can set the mappings before you index data
	es.create_index("unidatareport")


	#create mapping for type "dataunit"
	mapping = {
	"dataunit" : {
	"properties" : {
	"Company" : {"type" : "string", "index" : "not_analyzed"},
	"RevenueType" : {"type" : "string", "index" : "not_analyzed"},
	"Commodity" : {"type" : "string", "index" : "not_analyzed"},
	"Revenue" : {"type" : "integer"}
	}
	}
	}
	es.put_mapping("unidatareport", "dataunit", mapping)

	# attempt to make this multifield, was not useful
	# mapping = {
	# "dataunit" : {
	# "properties" : {
	# "Company" : {
	# "type" : "multi_field",
	# "fields" : {
	# "Company" : {"type" : "string", "index" : "analyzed"},
	# "raw" : { "type" : "string", "index" : "not_analyzed"}
	# }
	# },
	# "RevenueType" : {"type" : "string"},
	# "Commodity" : {"type" : "string", "index" : "not_analyzed"},
	# "Revenue" : {"type" : "integer"}
	# }
	# }
	# }
	# es.put_mapping("unidatareport", "dataunit", mapping)


	#iterate through file
	def UniData(filename):

	#open file
	wb = open_workbook(filename, on_demand=True)
	sheet = wb.sheet_by_index(0)

	count = 0
	for s in wb.sheets():
	for row in range(s.nrows):
	data = {}
	count += 1
	for col in range(s.ncols):
	if col == 0:
	key = 'Company'
	elif col == 1:
	key = 'RevenueType'
	elif col == 2:
	key = 'Commodity'
	elif col == 3:
	key = 'Revenue'
	# else:
	# print 'Unexpected number of columns'

	value = s.cell(row,col).value
	data[key] = value
	es.index("unidatareport", "dataunit", data, id=count)


	UniData('uni_dummySet_apr17.xlsx')