Skip to content

Instantly share code, notes, and snippets.

@meiqimichelle
Last active August 29, 2015 14:00
Show Gist options
  • Save meiqimichelle/11298042 to your computer and use it in GitHub Desktop.
Save meiqimichelle/11298042 to your computer and use it in GitHub Desktop.
Index excel data to elasticsearch
# ############## Michelle: Excel to JSON to ES ##############
from xlrd import open_workbook
from pyelasticsearch import ElasticSearch
es = ElasticSearch('http://ec2-99-999-99-99.us-gov-west-1.compute.amazonaws.com:9200') # This is an example location. Change to local or other hosted location to use.
#create index first so you can set the mappings before you index data
es.create_index("unidatareport")
#create mapping for type "dataunit"
mapping = {
"dataunit" : {
"properties" : {
"Company" : {"type" : "string", "index" : "not_analyzed"},
"RevenueType" : {"type" : "string", "index" : "not_analyzed"},
"Commodity" : {"type" : "string", "index" : "not_analyzed"},
"Revenue" : {"type" : "integer"}
}
}
}
es.put_mapping("unidatareport", "dataunit", mapping)
# attempt to make this multifield, was not useful
# mapping = {
# "dataunit" : {
# "properties" : {
# "Company" : {
# "type" : "multi_field",
# "fields" : {
# "Company" : {"type" : "string", "index" : "analyzed"},
# "raw" : { "type" : "string", "index" : "not_analyzed"}
# }
# },
# "RevenueType" : {"type" : "string"},
# "Commodity" : {"type" : "string", "index" : "not_analyzed"},
# "Revenue" : {"type" : "integer"}
# }
# }
# }
# es.put_mapping("unidatareport", "dataunit", mapping)
#iterate through file
def UniData(filename):
#open file
wb = open_workbook(filename, on_demand=True)
sheet = wb.sheet_by_index(0)
count = 0
for s in wb.sheets():
for row in range(s.nrows):
data = {}
count += 1
for col in range(s.ncols):
if col == 0:
key = 'Company'
elif col == 1:
key = 'RevenueType'
elif col == 2:
key = 'Commodity'
elif col == 3:
key = 'Revenue'
# else:
# print 'Unexpected number of columns'
value = s.cell(row,col).value
data[key] = value
es.index("unidatareport", "dataunit", data, id=count)
UniData('uni_dummySet_apr17.xlsx')
@meiqimichelle
Copy link
Author

@nmalcolm Yup, you're right -- I was shown the light about data access after this script was written. That location is a sandbox for test sites, and the data is dummy data. The real stuff will be architectured differently (...and by someone that does backend stuff, aka not me!). Now that I know, though, I'm thinking about removing the location from this gist anyways -- probably a better practice.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment