Created: April 10, 2017 15:25
Save apfister/38942433504ed8bcc6a6e285387f0e3d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import arcpy | |
import json | |
import csv | |
from datetime import datetime | |
import time | |
# Wall-clock start of the run; used at the end to report total elapsed time.
start_time = time.time()
# Allow re-running the script without manually deleting prior outputs.
arcpy.env.overwriteOutput = True
# geometry paths
# Directory holding one "<REF_AREA>-esri.json" Esri-JSON polygon file per area.
geometry_dir = "geometry"
# Create insert cursor for table
# File geodatabase that receives one feature class per SERIES code.
working_path = "C:/Users/adam/Documents/ArcGIS/Projects/UNSD Data Import/sdg_data.gdb"
# Template feature class whose field schema is copied for each new feature class.
sdg_data_schema_template = '__sdg_data_schema_template'
# Field order for the insert cursor; must match the attribute tuples built in
# parseCSV(), with the geometry token ("SHAPE@") appended as the last value.
fields = ["GoalId","TargetId","IndicatorRef","IndicatorRefOrder","IndicatorRefList","IndicatorId",
"IndicatorDesc","IndicatorFN","IndicatorTier","isRetired","SERIES","SERIES_Desc","SERIES_isOfficial","SERIES_show",
"sdgArea_Level","sdgArea_Order","sdgArea_Parent","REF_AREA","sdgArea_Name","isLDC","isLLDC","isSIDS",
"FREQ","SOURCE_TYPE","AGE_GROUP","LOCATION","SEX","VALUE_TYPE","UNIT","UNIT_MULT","TIME_PERIOD","OBS_VALUE","NATURE",
"FOOTNOTES","SOURCE_DETAIL","TIME_DETAIL", "SHAPE@"]
# Rows grouped by SERIES code: {series: [row_tuple, ...]} — filled by parseCSV().
all_rows = {}
# Cache of loaded geometries keyed by REF_AREA so each JSON file is read once.
geometry_hash = {}
def parseCSV(): | |
# csv file | |
csv_path = "input/sdg-data-all-test.csv" | |
# i have no idea why i need to include this encoding flag | |
# by some stroke of luck, i found the same issue and solution here: https://bugs.python.org/issue7185 | |
with open(csv_path, encoding="utf-8-sig") as csv_file: | |
reader = csv.DictReader(csv_file) | |
for row in reader: | |
goal_id = row['GoalId'] | |
target_id = row['TargetId'] | |
indicator_ref = row['IndicatorRef'] | |
indicator_ref_order = row['IndicatorRefOrder'] | |
indicator_ref_list = row['IndicatorRefList'] | |
indicator_id = row['IndicatorId'] | |
indicator_desc = row['IndicatorDesc'] | |
indicator_fn = row['IndicatorFN'] | |
indicator_tier = row['IndicatorTier'] | |
is_retired = row['isRetired'] | |
series = row['SERIES'] | |
series_desc = row['SERIES_Desc'] | |
series_is_official = row['SERIES_isOfficial'] | |
series_show = row['SERIES_show'] | |
sdg_area_level = row['sdgArea_Level'] | |
if sdg_area_level == 'NULL': | |
sdg_area_level = None | |
sdg_area_order = row['sdgArea_Order'] | |
if sdg_area_order == 'NULL': | |
sdg_area_order = None | |
sdg_area_parent = row['sdgArea_Parent'] | |
if sdg_area_parent == 'NULL': | |
sdg_area_parent = None | |
ref_area = row['REF_AREA'] | |
sdg_area_name = row['sdgArea_Name'] | |
is_ldc = row['isLDC'] | |
if is_ldc == 'NULL': | |
is_ldc = None | |
is_lldc = row['isLLDC'] | |
if is_lldc == 'NULL': | |
is_lldc = None | |
is_sids = row['isSIDS'] | |
if is_sids == 'NULL': | |
is_sids = None | |
freq = row['FREQ'] | |
source_type = row['SOURCE_TYPE'] | |
age_group = row['AGE_GROUP'] | |
location = row['LOCATION'] | |
sex = row['SEX'] | |
value_type = row['VALUE_TYPE'] | |
unit = row['UNIT'] | |
unit_mult = row['UNIT_MULT'] | |
time_period = datetime.strptime(row['TIME_PERIOD'], '%Y') | |
# remove any instances of '<', '>' or ' ' | |
obs_value = row['OBS_VALUE'].replace('>', '').replace('<', '').replace(' ', '') | |
nature = row['NATURE'] | |
footnotes = row['FOOTNOTES'] | |
source_detail = row['SOURCE_DETAIL'] | |
time_detail = row['TIME_DETAIL'] | |
try: | |
obs_value = float(row['OBS_VALUE']) | |
except: | |
pass | |
row_hash_id = series | |
row = (goal_id, target_id, indicator_ref, indicator_ref_order, indicator_ref_list, indicator_id, | |
indicator_desc, indicator_fn, indicator_tier, is_retired, series, series_desc, series_is_official, series_show, | |
sdg_area_level, sdg_area_order, sdg_area_parent, ref_area, sdg_area_name, is_ldc, is_lldc, is_sids, | |
freq, source_type, age_group, location, sex, value_type, unit, unit_mult, time_period, obs_value, nature, | |
footnotes, source_detail, time_detail) | |
if row_hash_id not in all_rows: | |
all_rows[row_hash_id] = [] | |
all_rows[row_hash_id].append(row) | |
return all_rows | |
def createFeatureClass(fc_name):
    """Create an empty POLYGON feature class named *fc_name* in the working
    geodatabase, copying its field schema from the template feature class."""
    template = working_path + '/' + sdg_data_schema_template
    arcpy.CreateFeatureclass_management(working_path, fc_name, 'POLYGON', template)
def insertData():
    """Create one feature class per SERIES code and insert its grouped rows.

    For each SERIES key in the module-level ``all_rows`` dict this creates a
    feature class from the schema template, then inserts every attribute tuple
    with the country/area polygon appended. Geometries are cached in
    ``geometry_hash`` so each REF_AREA's JSON file is opened at most once.
    """
    total_rows = len(all_rows)
    row_counter = 1
    for fc_name, fc_rows in all_rows.items():
        createFeatureClass(fc_name)
        insert_cursor = arcpy.da.InsertCursor(working_path + '/' + fc_name, fields)
        try:
            for row in fc_rows:
                ref_area = row[17]  # REF_AREA position within `fields`
                # Look up geometry in the in-memory cache; on a miss, load the
                # per-area Esri JSON file and cache the result. Failures are
                # cached as None so a missing file is only attempted once
                # (the original retried — and leaked a file handle — per row).
                if ref_area not in geometry_hash:
                    try:
                        geometry_path = geometry_dir + '/' + ref_area + '-esri.json'
                        with open(geometry_path) as geometry_file:
                            geometry = json.loads(geometry_file.read())
                        geometry_hash[ref_area] = arcpy.AsShape(geometry, True)
                    except Exception:
                        # Best effort: rows with no usable geometry are still
                        # inserted, just without a shape.
                        geometry_hash[ref_area] = None
                esri_geometry = geometry_hash[ref_area]
                try:
                    insert_cursor.insertRow(row + (esri_geometry,))
                except Exception:
                    print ('error inserting row for ' + fc_name)
                    print (row + (esri_geometry,))
                    raise
        finally:
            # Release the cursor (and its schema lock) even if an insert failed.
            del insert_cursor
        print ('created ' + fc_name + ' with ' + str(len(fc_rows)) + ' rows. (' + str(row_counter) + ' of ' + str(total_rows) + ' total Series Datasets)')
        row_counter = row_counter + 1
if __name__ == "__main__":
    print ('reading csv file ..')
    parseCSV()
    print ('csv file loaded.')
    print ('inserting data ..')
    insertData()
    end_time = time.time()
    elapsed = end_time - start_time
    print ('done!')
    # BUG FIX: `elapsed` is a float; the original concatenated it directly to
    # a str, raising TypeError at the very end of every run.
    print ('total time : (' + str(elapsed) + ') seconds')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.