Skip to content

Instantly share code, notes, and snippets.

@tbonza2
Created February 17, 2015 04:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tbonza2/8ccb828fd9cc5fc2e37f to your computer and use it in GitHub Desktop.
Save tbonza2/8ccb828fd9cc5fc2e37f to your computer and use it in GitHub Desktop.
""" Prepare for elasticsearch """
import re
import os
import json
import pandas as pd
def get_directory_paths(location):
    """Map a cleaned-up table name to the path of each file in a directory.

    Every regular file directly inside ``location`` is returned; the file
    contents are not inspected, so the directory is expected to hold only
    the delimited files that should be loaded. Sub-directories are skipped.

    Args:
        location: Path of the directory to scan.

    Returns:
        dict of ``{clean_name: filepath}`` where ``clean_name`` is the file
        name without its extension, with every whitespace character, dot
        and hyphen replaced by an underscore.

    Raises:
        OSError: If ``location`` cannot be listed.
    """
    # Characters that would make an awkward table name.
    pattern = re.compile(r"\s|\.|-")
    locations = {}
    # Sorted so that, if two files clean to the same name, the winner does
    # not depend on the order the operating system lists them in.
    for filename in sorted(os.listdir(location)):
        filepath = os.path.join(location, filename)
        if not os.path.isfile(filepath):
            continue
        # splitext copes with any extension length (".tsv", ".json", none),
        # unlike slicing a fixed four characters off the end.
        stem = os.path.splitext(filename)[0]
        locations[pattern.sub("_", stem)] = filepath
    return locations
def read_csvs(location_dict):
    """Load every tab-separated file named in ``location_dict``.

    Takes a ``{name: filepath}`` mapping and returns a dict with the same
    keys, where each value is the pandas DataFrame parsed from that file
    (UTF-8 encoded, tab as the field separator).
    """
    return {
        name: pd.read_csv(path, sep="\t", encoding='utf-8')
        for name, path in location_dict.items()
    }
def first_body(outputdict, g_row):
    """Build the business-level part of a document from one 'grades' row.

    Reads the row at position ``g_row`` of ``outputdict['grades']`` and
    copies its columns into a new dict. The address columns are nested
    under 'Location'. 'Latitude', 'Longitude' and 'Meta-Attribute' are
    emitted as empty strings.
    """
    grades = outputdict['grades']

    def cell(column):
        # Value of ``column`` at positional row ``g_row``.
        return grades[column].iloc[g_row]

    return {
        'BusinessName': cell('BusinessName'),
        'count_3': cell('count_3'),
        'count_2': cell('count_2'),
        'count_1': cell('count_1'),
        'LICSTATUS': cell('LICSTATUS'),
        'EXPDTTM': cell('EXPDTTM'),
        'LICENSENO': cell('LICENSENO'),
        'Location': {
            'Address': cell('Address'),
            'City': cell('City'),
            'State': cell('State'),
            'ZIP': cell('ZIP'),
            'Latitude': '',
            'Longitude': '',
        },
        'Grade': cell('Grade'),
        'Meta-Attribute': '',
    }
def second_body(body, findme):
    """Attach the violation records in ``findme`` to ``body``.

    Args:
        body: Document dict to update in place. Any existing 'Violations'
            entry is replaced.
        findme: DataFrame holding the violation rows for one business. When
            it has at least one row it must contain every column named in
            ``columns`` below.

    Returns:
        The same ``body`` object, with ``body['Violations']`` set to a list
        holding one dict per row of ``findme`` (an empty list when
        ``findme`` has no rows).

    Raises:
        KeyError: If ``findme`` has rows but lacks one of the columns.
    """
    columns = ('VIOLDTTM', 'ViolDate', 'ViolLevel', 'ViolStatus',
               'Violation', 'ViolDesc', 'RESULT', 'ISSDTTM',
               'LICENSECAT', 'Comments')
    # range() rather than xrange() so the function runs on Python 2 and 3.
    # Rows are addressed by position, so the index labels of the filtered
    # frame do not matter.
    body['Violations'] = [
        dict((column, findme[column].iloc[v_row]) for column in columns)
        for v_row in range(len(findme))
    ]
    return body
def to_elastic(outputdict):
    """Assemble one Elasticsearch-style document per row of the grades table.

    Each document has the form:

        {'index': 'grades', 'doc_type': 'tabular', 'id': ID, 'body': {
            'BusinessName': '',
            'count_3': '',
            'count_2': '',
            'count_1': '',
            'LICSTATUS': '',
            'EXPDTTM': '',
            'LICENSENO': '',
            'Location': {'Address': '',
                         'City': '',
                         'State': '',
                         'ZIP': '',
                         'Latitude': '',
                         'Longitude': ''},
            'Grade': '',
            'Violations': [{'VIOLDTTM': '',
                            'ViolDate': '',
                            'ViolLevel': '',
                            'ViolStatus': '',
                            'Violation': '',
                            'ViolDesc': '',
                            'RESULT': '',
                            'ISSDTTM': '',
                            'LICENSECAT': '',
                            'Comments': ''}, ...],
            'Meta-Attribute': ''}}

    Args:
        outputdict: dict holding a 'grades' DataFrame and a 'violations'
            DataFrame, both with an 'ID' column linking violations to the
            business they belong to.

    Returns:
        dict keyed by grade ID whose values follow the schema above. Nothing
        is written to disk here; the caller serialises the result, so the
        documents can still be updated (e.g. with lat/long) beforehand.
        If the same ID occurs on several grade rows, the last row wins.
    """
    print("to_elastic() started\n")
    grades = outputdict['grades']
    violations = outputdict['violations']

    # Split the violations table by ID once, instead of rescanning the
    # whole table for every grade row. Row order inside each group is the
    # order of the original table.
    violations_by_id = dict(
        (key, rows) for key, rows in violations.groupby('ID'))
    # Zero-row frame that keeps the columns, for businesses with no match.
    no_violations = violations.iloc[0:0]

    outdict = {}
    for g_row in range(len(grades)):
        id_ = grades['ID'].iloc[g_row]
        # numpy scalars are not accepted as JSON object keys on Python 3;
        # the builtin equivalent hashes and compares the same.
        if hasattr(id_, 'item'):
            id_ = id_.item()
        body = first_body(outputdict, g_row)
        # Match on the business ID, not on the row position: the two only
        # coincide when IDs happen to be 0..n-1 in table order.
        findme = violations_by_id.get(id_, no_violations)
        body = second_body(body, findme)
        outdict[id_] = {'index': 'grades',
                        'doc_type': 'tabular',
                        'id': id_,
                        'body': body}
    print("to_elastic() completed\n")
    return outdict
def _json_default(value):
    """Fallback for json.dump: turn numpy scalars into builtin values."""
    # numpy scalar types expose .item(), which returns the equivalent
    # builtin int/float/bool; anything else is genuinely unsupported.
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError("%r is not JSON serializable" % (value,))


if __name__ == "__main__":
    # Script entry point: load every table in the data directory, build the
    # documents and write them next to the inputs as data.json.
    print("script started")
    location = '/home/tbonza/code/cleanpizza/data'
    output_name = 'data.json'
    listdict = get_directory_paths(location)
    # The output lands in the input directory, so drop a data.json left by
    # a previous run rather than trying to parse it as a table.
    listdict = dict((name, path) for name, path in listdict.items()
                    if os.path.basename(path) != output_name)
    outputdict = read_csvs(listdict)
    data = to_elastic(outputdict)
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(location, output_name), 'w') as outfile:
        json.dump(data, outfile, default=_json_default)
    print("script completed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment