Last active
August 29, 2015 14:00
-
-
Save meiqimichelle/11298042 to your computer and use it in GitHub Desktop.
Index excel data to elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ############## Michelle: Excel to JSON to ES ############## | |
from xlrd import open_workbook | |
from pyelasticsearch import ElasticSearch | |
es = ElasticSearch('http://ec2-99-999-99-99.us-gov-west-1.compute.amazonaws.com:9200') # This is an example location. Change to local or other hosted location to use. | |
#create index first so you can set the mappings before you index data | |
es.create_index("unidatareport") | |
#create mapping for type "dataunit" | |
mapping = { | |
"dataunit" : { | |
"properties" : { | |
"Company" : {"type" : "string", "index" : "not_analyzed"}, | |
"RevenueType" : {"type" : "string", "index" : "not_analyzed"}, | |
"Commodity" : {"type" : "string", "index" : "not_analyzed"}, | |
"Revenue" : {"type" : "integer"} | |
} | |
} | |
} | |
es.put_mapping("unidatareport", "dataunit", mapping) | |
# attempt to make this multifield, was not useful | |
# mapping = { | |
# "dataunit" : { | |
# "properties" : { | |
# "Company" : { | |
# "type" : "multi_field", | |
# "fields" : { | |
# "Company" : {"type" : "string", "index" : "analyzed"}, | |
# "raw" : { "type" : "string", "index" : "not_analyzed"} | |
# } | |
# }, | |
# "RevenueType" : {"type" : "string"}, | |
# "Commodity" : {"type" : "string", "index" : "not_analyzed"}, | |
# "Revenue" : {"type" : "integer"} | |
# } | |
# } | |
# } | |
# es.put_mapping("unidatareport", "dataunit", mapping) | |
#iterate through file | |
def UniData(filename): | |
#open file | |
wb = open_workbook(filename, on_demand=True) | |
sheet = wb.sheet_by_index(0) | |
count = 0 | |
for s in wb.sheets(): | |
for row in range(s.nrows): | |
data = {} | |
count += 1 | |
for col in range(s.ncols): | |
if col == 0: | |
key = 'Company' | |
elif col == 1: | |
key = 'RevenueType' | |
elif col == 2: | |
key = 'Commodity' | |
elif col == 3: | |
key = 'Revenue' | |
# else: | |
# print 'Unexpected number of columns' | |
value = s.cell(row,col).value | |
data[key] = value | |
es.index("unidatareport", "dataunit", data, id=count) | |
UniData('uni_dummySet_apr17.xlsx') |
@nmalcolm Yup, you're right -- I was shown the light about data access after this script was written. That location is a sandbox for test sites, and the data is dummy data. The real stuff will be architectured differently (...and by someone that does backend stuff, aka not me!). Now that I know, though, I'm thinking about removing the location from this gist anyways -- probably a better practice.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm pretty sure this is meant to be either behind a firewall or password protected. There's nothing stopping me or anyone else deleting/modifying the data. If it's providing data for government websites or entities that's even more of a concern.