# pythononwheels/json2cerberus.py

## json2cerberus.py
#
# convert json data to a cerberus schema.
# Cerberus types see here: http://docs.python-cerberus.org/en/stable/validation-rules.html#type
#
# sampledata: https://www.json-generator.com/
#
# this uses the first data element in a given json file to create
# a model(cerberus) schema from it. Trying to guess the right types (without too much effort)
#

import simplejson as json
import re
from dateutil.parser import parse
import sys
import click
from collections import OrderedDict

# compiled, case-insensitive pattern for a hex string in the 8-4-4-4-12 uuid layout.
# NOTE(review): not referenced in the visible part of this file - confirm it is still needed.
uuid = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I)
# number of "-" characters in the separator lines printed by json_to_cerberus.
line_lenght=90
def is_date(string):
    """
        Return True if dateutil's parse() accepts string as a date/time,
        False otherwise.

        NOTE(review): dateutil's parser is lenient; short numeric strings
        may be accepted as dates as well - verify against your data.
    """
    try:
        parse(string)
    except (ValueError, OverflowError, TypeError):
        # ValueError:    the string does not look like a date
        # OverflowError: a numeric field is too large for the platform
        # TypeError:     the value is not a string / character stream
        return False
    return True

def _guess_cerberus_type(value):
    """
        Map one decoded json value to the name of a cerberus type.
        Returns None if none of the rules match (e.g. for json null).
    """
    # bool has to be tested before int: in python bool is a subclass of int.
    if isinstance(value, bool):
        return "boolean"
    if isinstance(value, int):
        return "integer"
    if isinstance(value, float):
        return "float"
    if isinstance(value, list):
        return "list"
    if isinstance(value, dict):
        # the cerberus type name is "dict" ("dictionary" is not a valid type)
        return "dict"
    if isinstance(value, str):
        # todo check if it is a date (date = datetime without h:m:s:.xx)
        return "datetime" if is_date(value) else "string"
    if isinstance(value, (bytes, bytearray)):
        return "binary"
    return None


@click.command()
@click.option('--infile', required=True, help='json file to read')
@click.option('--start_element', default=0, help="Element to process, if json file contains a list. Default=0")
def json_to_cerberus(infile, start_element):
    """
        Convert json data (infile)
        to a (simple) cerberus schema.
        If the json contains a list of elements you can specify the
        element to inspect. Default is the first element (0)

        This is meant to be a convenience utility to take away all the initial typing
        when dealing with longer json formats...
        You can then easily add real constraints to the schema afterwards.
    """
    # NOTE: click prints the docstring above as the --help text, so the
    # technical details are kept in comments:
    #   infile        -- path of the json file to read (decoded as utf-8)
    #   start_element -- index of the element to inspect when the top level
    #                    of the json is a list; ignored for a single object
    # Prints the inspected element and the guessed schema to stdout.
    # json decoding errors propagate unchanged to the caller.
    print("opening json data file: {}".format(infile))
    # the context manager closes the file even if decoding fails
    with open(infile, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        try:
            mydata = data[start_element]
        except IndexError:
            raise click.BadParameter(
                "json list has {} element(s), there is no element {}".format(
                    len(data), start_element),
                param_hint="--start_element") from None
    else:
        # a single top level json object is inspected directly
        mydata = data
    if not isinstance(mydata, dict):
        raise click.ClickException(
            "expected a json object to inspect, got: {}".format(type(mydata).__name__))

    # sample output schema format:
    # schema = {'name': {'type': 'string'} }
    cerberus_schema = {}
    for elem, value in mydata.items():
        cerberus_type = _guess_cerberus_type(value)
        if cerberus_type is None:
            cerberus_type = "string"
            print("type unknown, setting string.")
        cerberus_schema[elem] = {"type": cerberus_type}

    from pprint import PrettyPrinter
    pp = PrettyPrinter(indent=4)
    separator = line_lenght * "-"
    print(separator)
    print("|  json data: {}".format(infile))
    print(separator)
    pp.pprint(mydata)
    print(separator)
    print("|  Model schema for: {}".format(infile))
    print(separator)
    print("schema = { ")
    # emit '"key": {...},' lines so the printed block is a valid python dict literal
    for elem in sorted(cerberus_schema):
        print("    {:20} {},".format('"' + elem + '":', cerberus_schema[elem]))

    print("}")
    print(separator)
    print("|   you can copy&paste this right into your model for example.")
    print(separator)

# run the click command only when this file is executed as a script;
# click reads --infile / --start_element from the command line.
if __name__ == "__main__":
    json_to_cerberus()
	#
	# convert json data to a cerberus schema.
	# Cerberus types see here: http://docs.python-cerberus.org/en/stable/validation-rules.html#type
	#
	# sampledata: https://www.json-generator.com/
	#
	# this uses the first data element in a given json file to create
	# a model(cerberus) schema from it. Trying to guess the right types (without too much effort)
	#

	import simplejson as json
	import re
	from dateutil.parser import parse
	import sys
	import click
	from collections import OrderedDict

	# compiled, case-insensitive pattern for a hex string in the 8-4-4-4-12 uuid layout.
	# NOTE(review): not referenced in the visible part of this file - confirm it is still needed.
	uuid = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I)
	# number of "-" characters in the separator lines printed by json_to_cerberus.
	line_lenght=90
	def is_date(string):
	    """
	        Return True if dateutil's parse() accepts string as a date/time,
	        False otherwise.

	        NOTE(review): dateutil's parser is lenient; short numeric strings
	        may be accepted as dates as well - verify against your data.
	    """
	    try:
	        parse(string)
	    except (ValueError, OverflowError, TypeError):
	        # ValueError:    the string does not look like a date
	        # OverflowError: a numeric field is too large for the platform
	        # TypeError:     the value is not a string / character stream
	        return False
	    return True

	def _guess_cerberus_type(value):
	    """
	        Map one decoded json value to the name of a cerberus type.
	        Returns None if none of the rules match (e.g. for json null).
	    """
	    # bool has to be tested before int: in python bool is a subclass of int.
	    if isinstance(value, bool):
	        return "boolean"
	    if isinstance(value, int):
	        return "integer"
	    if isinstance(value, float):
	        return "float"
	    if isinstance(value, list):
	        return "list"
	    if isinstance(value, dict):
	        # the cerberus type name is "dict" ("dictionary" is not a valid type)
	        return "dict"
	    if isinstance(value, str):
	        # todo check if it is a date (date = datetime without h:m:s:.xx)
	        return "datetime" if is_date(value) else "string"
	    if isinstance(value, (bytes, bytearray)):
	        return "binary"
	    return None


	@click.command()
	@click.option('--infile', required=True, help='json file to read')
	@click.option('--start_element', default=0, help="Element to process, if json file contains a list. Default=0")
	def json_to_cerberus(infile, start_element):
	    """
	        Convert json data (infile)
	        to a (simple) cerberus schema.
	        If the json contains a list of elements you can specify the
	        element to inspect. Default is the first element (0)

	        This is meant to be a convenience utility to take away all the initial typing
	        when dealing with longer json formats...
	        You can then easily add real constraints to the schema afterwards.
	    """
	    # NOTE: click prints the docstring above as the --help text, so the
	    # technical details are kept in comments:
	    #   infile        -- path of the json file to read (decoded as utf-8)
	    #   start_element -- index of the element to inspect when the top level
	    #                    of the json is a list; ignored for a single object
	    # Prints the inspected element and the guessed schema to stdout.
	    # json decoding errors propagate unchanged to the caller.
	    print("opening json data file: {}".format(infile))
	    # the context manager closes the file even if decoding fails
	    with open(infile, "r", encoding="utf-8") as f:
	        data = json.load(f)

	    if isinstance(data, list):
	        try:
	            mydata = data[start_element]
	        except IndexError:
	            raise click.BadParameter(
	                "json list has {} element(s), there is no element {}".format(
	                    len(data), start_element),
	                param_hint="--start_element") from None
	    else:
	        # a single top level json object is inspected directly
	        mydata = data
	    if not isinstance(mydata, dict):
	        raise click.ClickException(
	            "expected a json object to inspect, got: {}".format(type(mydata).__name__))

	    # sample output schema format:
	    # schema = {'name': {'type': 'string'} }
	    cerberus_schema = {}
	    for elem, value in mydata.items():
	        cerberus_type = _guess_cerberus_type(value)
	        if cerberus_type is None:
	            cerberus_type = "string"
	            print("type unknown, setting string.")
	        cerberus_schema[elem] = {"type": cerberus_type}

	    from pprint import PrettyPrinter
	    pp = PrettyPrinter(indent=4)
	    separator = line_lenght * "-"
	    print(separator)
	    print("|  json data: {}".format(infile))
	    print(separator)
	    pp.pprint(mydata)
	    print(separator)
	    print("|  Model schema for: {}".format(infile))
	    print(separator)
	    print("schema = { ")
	    # emit '"key": {...},' lines so the printed block is a valid python dict literal
	    for elem in sorted(cerberus_schema):
	        print("    {:20} {},".format('"' + elem + '":', cerberus_schema[elem]))

	    print("}")
	    print(separator)
	    print("|   you can copy&paste this right into your model for example.")
	    print(separator)

	# run the click command only when this file is executed as a script;
	# click reads --infile / --start_element from the command line.
	if __name__ == "__main__":
	    json_to_cerberus()