@danelliottster
Created June 18, 2021 20:48
Collect all values for each attribute found in a set of JSON files
import sys
import os , json
# uses the flatten_json library (available on PyPI as flatten-json and via GitHub)
from flatten_json import flatten
IGNORE_LIST = [ 'SeasonInfo|boundaries' ]
IGNORE_LIST_ADVANCED = [ 'LastModifedDate' , 'GrowerId' , 'FarmId' , 'FieldId' , 'FieldName' , 'FarmName' , 'seasonid' , 'seasongroupid' , 'metricid' , 'metricgroupid' , 'FieldCenterLon' , 'FieldCenterLat' , 'fertilizerApplication|values|dateofApplication' , 'fertilizerApplication|values|N' , 'fertilizerApplication|values|amountOfProductApplied' , 'fertilizerApplication|values|nApplied' , 'fertilizerApplication|values|pApplied' , 'cropprotectionplan|cropprotectionplandetails|dateofapplication' , 'planting|plantingDate' , 'harvest|fieldArea' , 'harvest|plantedArea' , 'harvest|harvest_yield|Cutting|yield' , 'SeasonInfo|fieldName' , 'SeasonInfo|plantingDate' , 'SeasonInfo|acres' , 'SeasonInfo|PreviousSeasonGroupID' , 'SeasonInfo|soil' , 'SeasonInfo|slopeLength' , 'SeasonInfo|nearbysurfacewater' , 'SeasonInfo|huc12WatershedCode', 'SeasonInfo|huc12WatershedName' , 'TownShip' , 'Range' , 'fpc_manid' , 'datecreated' ]
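#
# Illustrative note (not part of the original gist): flatten() from flatten_json
# turns nested dicts and lists into a single-level dict whose keys are the path
# segments joined by the chosen separator.  For a hypothetical input:
#   flatten( { 'harvest' : { 'yield' : [ 1.2 , 3.4 ] } } , '|' )
#   -> { 'harvest|yield|0' : 1.2 , 'harvest|yield|1' : 3.4 }
# List positions show up as numeric path segments; the loops below strip them
# so every element of an array contributes to the same condensed key.
#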
#
# create a list of keynames
# condense arrays into a single element
#
fh = open( 'sample.json' , 'r' )
test_json = json.load( fh )
fh.close()
sample_json = flatten( test_json[0] , '|' )
base_values = {}
for key_name in sample_json.keys() :
    key_path = key_name.split( '|' )
    key_path_tmp = filter( lambda x: not x.isnumeric() , key_path )
    key_path_final = '|'.join( key_path_tmp )
    if key_path_final not in base_values :
        base_values[ key_path_final ] = []
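# At this point base_values maps each condensed key path to an empty list,
# e.g. a hypothetical flattened key 'SeasonInfo|boundaries|0|lat' has been
# condensed to 'SeasonInfo|boundaries|lat'.  The walk below fills these lists
# with every value observed across the JSON files.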
for root , dirs , files in os.walk( 'directory_with_json_files' ) :
    path = root.split( os.sep )
    for file in files :
        if 'json' in file and file != 'unique_values.json' and file != 'value_counts.json' :
            print( path , file )
            fh = open( root + '/' + file , 'r' )
            test_json = json.load( fh )
            fh.close()
            for field_json in test_json :
                tmp_json = flatten( field_json , '|' )
                for key_name in tmp_json.keys() :
                    key_path = key_name.split( '|' )
                    key_path_tmp = filter( lambda x: not x.isnumeric() , key_path )
                    key_path_final = '|'.join( key_path_tmp )
                    if key_name not in IGNORE_LIST :
                        if key_path_final not in base_values :
                            base_values[ key_path_final ] = []
                        if type( tmp_json[ key_name ] ) == list :
                            base_values[ key_path_final ] += tmp_json[ key_name ]
                        else :
                            base_values[ key_path_final ] += [ tmp_json[ key_name ] ]
#
# what values are seen in the data
#
all_values = {}
for key_name in base_values :
    all_values[ key_name ] = list( set( base_values[ key_name ] ) )
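# all_values now holds the deduplicated values seen for each condensed key,
# e.g. (hypothetical) all_values[ 'planting|plantingDate' ] ->
# [ '2020-04-15' , '2021-04-20' ], assuming the collected values are hashable.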
#
# provide potential values and stats for each field
#
selected_value_counts = {}
for key_name in base_values :
    if key_name not in IGNORE_LIST_ADVANCED :
        selected_value_counts[ key_name ] = []
        for value in all_values[ key_name ] :
            selected_value_counts[ key_name ] += [ { 'value' : value , 'count' : base_values[ key_name ].count( value ) } ]
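#
# Hypothetical output step (not part of the original gist): the os.walk loop
# above skips 'unique_values.json' and 'value_counts.json', which suggests they
# are the intended result files.  The mapping below is an assumption;
# default=str is used so non-JSON-serialisable values are written as strings.
#
with open( 'directory_with_json_files/unique_values.json' , 'w' ) as fh :
    json.dump( all_values , fh , indent=2 , default=str )
with open( 'directory_with_json_files/value_counts.json' , 'w' ) as fh :
    json.dump( selected_value_counts , fh , indent=2 , default=str )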