Created
February 17, 2015 04:52
-
-
Save tbonza2/8ccb828fd9cc5fc2e37f to your computer and use it in GitHub Desktop.
Error that I'm having related to http://stackoverflow.com/questions/28554658/possible-bug-in-python-data-structures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Prepare for elasticsearch """ | |
import re | |
import os | |
import json | |
import pandas as pd | |
def get_directory_paths(location):
    """Return a mapping of sanitized table names to file paths.

    Scans *location* for entries and builds ``{clean_name: filepath}``,
    where ``clean_name`` is the filename minus its extension with
    whitespace, dots, and hyphens replaced by underscores, so it can be
    used as a table/index name.
    """
    # Characters that would make a bad table name.
    pattern = re.compile(r"\s|\.|-")
    locations = {}
    for filename in os.listdir(location):
        # splitext drops the extension whatever its length; the
        # original filename[:-4] silently assumed a 3-char extension.
        stem = os.path.splitext(filename)[0]
        clean_name = pattern.sub("_", stem)
        locations[clean_name] = os.path.join(location, filename)
    return locations
def read_csvs(location_dict):
    """Load each tab-delimited file into a pandas DataFrame.

    Takes ``{name: filepath}`` and returns ``{name: DataFrame}``.
    """
    frames = {}
    for name, path in location_dict.items():
        frames[name] = pd.read_csv(path, sep="\t", encoding='utf-8')
    return frames
def first_body(outputdict, g_row):
    """Assemble the Elasticsearch body dict for one 'grades' row.

    Pulls the business-level fields from row *g_row* of the 'grades'
    DataFrame.  Latitude/Longitude and Meta-Attribute are left empty so
    they can be filled in later.
    """
    row = outputdict['grades'].iloc[g_row]
    return {'BusinessName': row['BusinessName'],
            'count_3': row['count_3'],
            'count_2': row['count_2'],
            'count_1': row['count_1'],
            'LICSTATUS': row['LICSTATUS'],
            'EXPDTTM': row['EXPDTTM'],
            'LICENSENO': row['LICENSENO'],
            'Location': {'Address': row['Address'],
                         'City': row['City'],
                         'State': row['State'],
                         'ZIP': row['ZIP'],
                         'Latitude': '',
                         'Longitude': ''},
            'Grade': row['Grade'],
            'Meta-Attribute': ''}
def second_body(body, findme):
    """Attach the matching violation records to *body*.

    *findme* is the subset of the violations DataFrame for one
    business; each of its rows becomes a dict appended to
    ``body['Violations']``.  Returns the mutated *body*.
    """
    fields = ('VIOLDTTM', 'ViolDate', 'ViolLevel', 'ViolStatus',
              'Violation', 'ViolDesc', 'RESULT', 'ISSDTTM',
              'LICENSECAT', 'Comments')
    violations = []
    # range (not Python-2-only xrange) keeps this portable; pulling the
    # row once avoids ten separate per-column .iloc lookups per record.
    for v_row in range(len(findme)):
        row = findme.iloc[v_row]
        violations.append({field: row[field] for field in fields})
    body['Violations'] = violations
    return body
def to_elastic(outputdict):
    """Build the Elasticsearch index documents from the csv DataFrames.

    Each record is keyed by business ID and has the form:

        {'index': 'grades', 'doc_type': 'tabular', 'id': ID,
         'body': {'BusinessName': ..., 'count_3': ..., 'count_2': ...,
                  'count_1': ..., 'LICSTATUS': ..., 'EXPDTTM': ...,
                  'LICENSENO': ...,
                  'Location': {'Address': ..., 'City': ..., 'State': ...,
                               'ZIP': ..., 'Latitude': '', 'Longitude': ''},
                  'Grade': ...,
                  'Violations': [{'VIOLDTTM': ..., 'ViolDate': ...,
                                  'ViolLevel': ..., 'ViolStatus': ...,
                                  'Violation': ..., 'ViolDesc': ...,
                                  'RESULT': ..., 'ISSDTTM': ...,
                                  'LICENSECAT': ..., 'Comments': ...}, ...],
                  'Meta-Attribute': ''}}

    Returns:
        dict of {ID: record} suitable for dumping to JSON on disk so it
        can later be updated with things like lat/long.
    """
    print("to_elastic() started\n")
    outdict = {}
    grades = outputdict['grades']
    violations = outputdict['violations']
    for g_row in range(len(grades)):
        id_ = grades["ID"].iloc[g_row]
        body = first_body(outputdict, g_row)
        # Match violations by the business ID, not by the positional
        # loop counter: the original compared violations['ID'] against
        # g_row, which linked each business to the wrong (or no)
        # violation records.
        # NOTE(review): still a linear scan per business; a groupby on
        # violations['ID'] would be faster if this becomes a bottleneck.
        findme = violations[violations['ID'] == id_]
        body = second_body(body, findme)
        outdict[id_] = {'index': 'grades',
                        'doc_type': 'tabular',
                        'id': id_,
                        'body': body}
    print("to_elastic() completed\n")
    return outdict
if __name__ == "__main__": | |
print "script started" | |
location = '/home/tbonza/code/cleanpizza/data' | |
listdict = get_directory_paths(location) | |
outputdict = read_csvs(listdict) | |
data = to_elastic(outputdict) | |
with open(location + '/data.json', 'w') as outfile: | |
json.dump(data, outfile) | |
outfile.close() | |
print "script completed" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment