Created
February 17, 2015 04:52
-
-
Save tbonza2/8ccb828fd9cc5fc2e37f to your computer and use it in GitHub Desktop.
Error that I'm having related to http://stackoverflow.com/questions/28554658/possible-bug-in-python-data-structures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Prepare for elasticsearch """ | |
import re | |
import os | |
import json | |
import pandas as pd | |
def get_directory_paths(location):
    """Return a mapping of sanitized table names to file paths.

    Scans *location* for entries and builds ``{clean_name: filepath}``,
    where ``clean_name`` is the filename minus its extension with
    whitespace, dots, and hyphens replaced by underscores, so it can be
    used as a table/index name.
    """
    # Characters that would make a bad table name.
    pattern = re.compile(r"\s|\.|-")
    locations = {}
    for filename in os.listdir(location):
        # splitext drops the extension whatever its length; the
        # original filename[:-4] silently assumed a 3-char extension.
        stem = os.path.splitext(filename)[0]
        clean_name = pattern.sub("_", stem)
        locations[clean_name] = os.path.join(location, filename)
    return locations
def read_csvs(location_dict):
    """Load each tab-delimited file into a pandas DataFrame.

    Takes ``{name: filepath}`` and returns ``{name: DataFrame}``.
    """
    frames = {}
    for name, path in location_dict.items():
        frames[name] = pd.read_csv(path, sep="\t", encoding='utf-8')
    return frames
def first_body(outputdict, g_row):
    """Assemble the Elasticsearch body dict for one 'grades' row.

    Pulls the business-level fields from row *g_row* of the 'grades'
    DataFrame.  Latitude/Longitude and Meta-Attribute are left empty so
    they can be filled in later.
    """
    row = outputdict['grades'].iloc[g_row]
    return {'BusinessName': row['BusinessName'],
            'count_3': row['count_3'],
            'count_2': row['count_2'],
            'count_1': row['count_1'],
            'LICSTATUS': row['LICSTATUS'],
            'EXPDTTM': row['EXPDTTM'],
            'LICENSENO': row['LICENSENO'],
            'Location': {'Address': row['Address'],
                         'City': row['City'],
                         'State': row['State'],
                         'ZIP': row['ZIP'],
                         'Latitude': '',
                         'Longitude': ''},
            'Grade': row['Grade'],
            'Meta-Attribute': ''}
def second_body(body, findme):
    """Attach the matching violation records to *body*.

    *findme* is the subset of the violations DataFrame for one
    business; each of its rows becomes a dict appended to
    ``body['Violations']``.  Returns the mutated *body*.
    """
    fields = ('VIOLDTTM', 'ViolDate', 'ViolLevel', 'ViolStatus',
              'Violation', 'ViolDesc', 'RESULT', 'ISSDTTM',
              'LICENSECAT', 'Comments')
    violations = []
    # range (not Python-2-only xrange) keeps this portable; pulling the
    # row once avoids ten separate per-column .iloc lookups per record.
    for v_row in range(len(findme)):
        row = findme.iloc[v_row]
        violations.append({field: row[field] for field in fields})
    body['Violations'] = violations
    return body
def to_elastic(outputdict):
    """Build the Elasticsearch index documents from the csv DataFrames.

    Each record is keyed by business ID and has the form:

        {'index': 'grades', 'doc_type': 'tabular', 'id': ID,
         'body': {'BusinessName': ..., 'count_3': ..., 'count_2': ...,
                  'count_1': ..., 'LICSTATUS': ..., 'EXPDTTM': ...,
                  'LICENSENO': ...,
                  'Location': {'Address': ..., 'City': ..., 'State': ...,
                               'ZIP': ..., 'Latitude': '', 'Longitude': ''},
                  'Grade': ...,
                  'Violations': [{'VIOLDTTM': ..., 'ViolDate': ...,
                                  'ViolLevel': ..., 'ViolStatus': ...,
                                  'Violation': ..., 'ViolDesc': ...,
                                  'RESULT': ..., 'ISSDTTM': ...,
                                  'LICENSECAT': ..., 'Comments': ...}, ...],
                  'Meta-Attribute': ''}}

    Returns:
        dict of {ID: record} suitable for dumping to JSON on disk so it
        can later be updated with things like lat/long.
    """
    print("to_elastic() started\n")
    outdict = {}
    grades = outputdict['grades']
    violations = outputdict['violations']
    for g_row in range(len(grades)):
        id_ = grades["ID"].iloc[g_row]
        body = first_body(outputdict, g_row)
        # Match violations by the business ID, not by the positional
        # loop counter: the original compared violations['ID'] against
        # g_row, which linked each business to the wrong (or no)
        # violation records.
        # NOTE(review): still a linear scan per business; a groupby on
        # violations['ID'] would be faster if this becomes a bottleneck.
        findme = violations[violations['ID'] == id_]
        body = second_body(body, findme)
        outdict[id_] = {'index': 'grades',
                        'doc_type': 'tabular',
                        'id': id_,
                        'body': body}
    print("to_elastic() completed\n")
    return outdict
if __name__ == "__main__": | |
print "script started" | |
location = '/home/tbonza/code/cleanpizza/data' | |
listdict = get_directory_paths(location) | |
outputdict = read_csvs(listdict) | |
data = to_elastic(outputdict) | |
with open(location + '/data.json', 'w') as outfile: | |
json.dump(data, outfile) | |
outfile.close() | |
print "script completed" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment