# gdbassett/json_to_jsonld.py

## json_to_jsonld.py
import json
import logging
from collections import defaultdict
from urllib.parse import quote, unquote
from uuid import uuid4

def flatten(l):
    """Lazily yield the leaves of arbitrarily nested lists/tuples, in order.

    Only values whose exact type is ``list`` or ``tuple`` are descended into;
    everything else (including strings, dicts and subclasses of list/tuple)
    is yielded unchanged.
    """
    # Stack of iterators: the top one is the sequence currently being walked.
    pending = [iter(l)]
    while pending:
        for el in pending[-1]:
            if type(el) is list or type(el) is tuple:
                # Descend; the outer iterator resumes once this one is done.
                pending.append(iter(el))
                break
            yield el
        else:
            # Current sequence exhausted: go back to its parent.
            pending.pop()

def json2linked(j, name, ns, unique_id="", properties=False, objects=False, simple_ids=True):
    """Convert parsed JSON into a JSON-LD document with a minimal context.

    Args:
        j: Parsed JSON data to convert (handed to ``json2linked_r``).
        name: Label of the root; it prefixes every generated ``@id``.
        ns: Namespace IRI used for both ``@base`` and ``@vocab``.
        unique_id: Suffix appended to the root object's ``@id`` so roots from
            separate calls do not collide. Defaults to no suffix.
        properties: Passed through to ``json2linked_r``.
        objects: Passed through to ``json2linked_r``.
        simple_ids: If true, replace the ``_obj<suffix>`` part of each
            individual's ``@id`` with a short form: the bare prefix when it is
            the only individual with that prefix, otherwise ``<prefix>_<n>``
            with ``n`` counted from 1 in order of appearance.

    Returns:
        dict with an ``@context`` and an ``@graph`` (list of node objects).
    """
    # The previous default (``unique_id=unique_id``) referenced an undefined
    # name, and the value was never forwarded to the recursive builder.
    data = json2linked_r(j, name, unique_id, properties=properties, objects=objects)

    # add the data to a very basic context
    linked = {
        "@context":
            {
              "@base": ns,
              "@vocab": ns,
              "owl": "http://www.w3.org/2002/07/owl#",
              "xsd": "http://www.w3.org/2001/XMLSchema#",
              "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
              "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
            },
        "@graph": data}

    # We use UUIDs to identify unique objects in the initial pass.
    # Next we replace those with sequenced numbers.
    if simple_ids:
        # Group the ids of all individuals by their prefix; the position in
        # each group becomes the individual's number.
        # rsplit (not split) so a key that itself contains "_obj" keeps its
        # full name: only the trailing "_obj<suffix>" marker is removed.
        obj_map = defaultdict(list)
        for obj in linked['@graph']:
            if "owl:NamedIndividual" in obj["@type"]:
                obj_map[obj["@id"].rsplit("_obj", 1)[0]].append(obj["@id"])

        # Precompute old id -> legible id once instead of calling list.index
        # for every reference. setdefault keeps the first position when the
        # same id occurs more than once, matching list.index.
        renamed = {}
        for short_id, ids in obj_map.items():
            for position, old_id in enumerate(ids, start=1):
                new_id = short_id if len(ids) <= 1 else short_id + "_" + str(position)
                renamed.setdefault(old_id, new_id)

        def rename(node):
            # Rewrite a reference ({"@id": ...}) if it points at an individual.
            if type(node) == dict and "@id" in node:
                node["@id"] = renamed.get(node["@id"], node["@id"])

        for obj in linked['@graph']:
            obj['@id'] = renamed.get(obj['@id'], obj['@id'])
            for value in obj.values():
                if type(value) == list:
                    for entry in value:
                        rename(entry)
                else:
                    rename(value)

    return linked

def json2linked_r(d, name, unique_id="", properties=False, objects=False):
    """Recursively convert parsed JSON into a flat list of JSON-LD node objects.

    Each dict becomes an ``owl:NamedIndividual`` whose ``@id`` is
    ``<name>_obj<unique_id>``; each of its keys becomes a ``<name>.<key>``
    entry. Scalars (bool, int, float, str) are emitted as typed ``@value``
    literals. Dicts nested directly or inside lists become separate nodes
    referenced by ``@id`` and get a random 8-character suffix. ``None`` values
    and empty lists are skipped.

    Args:
        d: Parsed JSON value: a dict, or a list/tuple of dicts.
        name: Dotted path of ``d``; prefixes the ids generated for it.
        unique_id: Suffix appended to the ``@id`` of the node built for ``d``.
        properties: If true, also emit one node per key typed as
            ``owl:DatatypeProperty`` and/or ``owl:ObjectProperty`` with a
            matching ``rdfs:range``.
        objects: If true, type each individual with its quoted path instead
            of ``owl:Thing`` and make sure a node typed ``owl:Thing`` exists
            for every key.

    Returns:
        list of node dicts; nodes of nested dicts precede their parent.

    Raises:
        Exception: anything raised while converting is logged together with
            the offending value and re-raised unchanged.
    """
    type_map = {
        int: "xsd:integer",
        float: "xsd:float",
        str: "xsd:string",
        bool: "xsd:boolean"
    }
    scalar_types = (bool, int, float, str)

    def unique(values):
        # Order-preserving de-duplication: unlike list(set(...)) the result
        # does not change between interpreter runs.
        return list(dict.fromkeys(values))

    ret = []
    try:
        if type(d) == dict:
            # Create object for dict
            obj = {
                "@id": quote(f"{name}_obj{unique_id}"),
                "@type": ["owl:NamedIndividual", quote(name) if objects else "owl:Thing"]
            }
            for k, v in d.items():
                child_name = f"{name}.{k}"
                prop_id = quote(child_name)
                prop_types = []
                prop_ranges = []

                if type(v) in scalar_types:
                    prop_types.append("owl:DatatypeProperty")
                    prop_ranges.append("rdfs:Literal")
                    obj[prop_id] = [{"@value": v, "@type": type_map[type(v)]}]
                elif type(v) == dict:
                    prop_types.append("owl:ObjectProperty")
                    prop_ranges.append("owl:Thing")
                    u = "_" + str(uuid4())[:8]
                    obj[prop_id] = [{"@id": quote(f"{child_name}_obj{u}")}]
                    ret += json2linked_r(v, child_name, u, properties=properties, objects=objects)
                elif type(v) in (list, tuple) and len(v) > 0:
                    values = []
                    obj[prop_id] = values
                    # flatten() removes nested lists/tuples, so only scalars,
                    # dicts and None can show up here.
                    for item in flatten(v):
                        if type(item) in scalar_types:
                            prop_types.append("owl:DatatypeProperty")
                            prop_ranges.append("rdfs:Literal")
                            values.append({"@value": item, "@type": type_map[type(item)]})
                        elif type(item) == dict:
                            prop_types.append("owl:ObjectProperty")
                            prop_ranges.append("owl:Thing")
                            u = "_" + str(uuid4())[:8]
                            values.append({"@id": quote(f"{child_name}_obj{u}")})
                            ret += json2linked_r(item, child_name, u, properties=properties, objects=objects)
                # anything else (None, empty list, ...) is skipped

                if properties:
                    obj_prop = {
                        "@id": prop_id,
                        "@type": unique(prop_types),
                        "rdfs:range": [{"@id": rng} for rng in unique(prop_ranges)]
                    }
                    if len(obj_prop['@type']) > 1:
                        logging.warning("object property {0}, ({1}) has @type longer than 1. That may cause problems.".format(obj_prop["@id"], obj_prop['@type']))
                    ret.append(obj_prop)

                # Add in objects (or add type to property objects)
                if objects:
                    existing = next((node for node in ret if node['@id'] == prop_id), None)
                    if existing is None:
                        ret.append({
                            '@id': prop_id,
                            '@type': ["owl:Thing"]
                        })
                    else:
                        existing['@type'] = unique(existing['@type'] + ["owl:Thing"])

            ret.append(obj)
        elif type(d) in scalar_types:
            logging.warning("Shouldn't get here.")
        elif type(d) in (list, tuple) and len(d) > 0:
            # ``d`` is deliberately not rebound to the flatten() generator so
            # the warnings below show the actual data.
            for item in flatten(d):
                if type(item) is dict and len(item) > 0:
                    # Each dict in a root-level list needs its own suffix;
                    # recursing with "" gave every item the same @id.
                    item_id = unique_id + "_" + str(uuid4())[:8]
                    ret += json2linked_r(item, name, item_id, properties=properties, objects=objects)
                elif type(item) in (list, tuple):
                    logging.warning("Flatten should prevent you getting a list of lists. name: {0}, d: {1}".format(name, d))
                elif type(item) in scalar_types:
                    logging.warning("I don't think we should get a list of values. name: {0}, d: {1}".format(name, d))
                # None items are skipped
        # any other type (e.g. None, empty list) yields nothing

    except Exception:
        # Record which value failed at this recursion level, then propagate.
        logging.error("d: {0}, name: {1}".format(d, name))
        raise

    return ret

# Build JSONLD Graph
# NOTE(review): `records_dict` is not defined in this file; presumably a
# mapping of record name -> JSON object string provided by the caller — confirm.
graph = {"@context": {}, "@graph": []}
# Pre-bind the loop temporaries so the cleanup below cannot raise NameError
# when records_dict is empty.
j = jld = None
# we query the records separately so that the property paths are the same for each record
for name, record in records_dict.items():
    u = "_" + str(uuid4())[:8]
    j = json.loads(record)
    j['record'] = name
    # Set a unique_id so root records are unique.
    # We set objects = True so we can identify the root by its object type.
    # We set simple_ids = False to prevent collisions in IDs between the separately queried records.
    # We _could_ set properties = True so that we could query paths of subProperties of the top properties.
    jld = json2linked(j, "root", "https://example.org/ns#", unique_id=u, objects=True, simple_ids=False)
    graph['@context'] = jld['@context']
    graph['@graph'] += jld['@graph']
del j
del jld

# Because we joined records with the same structure, objects and properties will be duplicated.
# Here we delete duplicates, keeping the first node seen for each @id.
objs = set()
graph2 = list()
# The original iterated over an undefined name (`victor`); the nodes live in `graph`.
for obj in graph['@graph']:
    if obj['@id'] not in objs:
        objs.add(obj['@id'])
        graph2.append(obj)
graph['@graph'] = graph2
del graph2
	from urllib.parse import quote, unquote
	import logging
	from uuid import uuid4
	from collections import defaultdict

	def flatten(l):
	    """Lazily yield the leaves of arbitrarily nested lists/tuples, in order.

	    Only values whose exact type is ``list`` or ``tuple`` are descended
	    into; everything else is yielded unchanged.
	    """
	    # Block structure restored: this copy had lost its indentation.
	    for el in l:
	        if type(el) in [list, tuple]:
	            yield from flatten(el)
	        else:
	            yield el

	def json2linked(j, name, ns, unique_id="", properties=False, objects=False, simple_ids=True):
	    """Convert parsed JSON into a JSON-LD document with a minimal context.

	    Args:
	        j: Parsed JSON data to convert (handed to ``json2linked_r``).
	        name: Label of the root; it prefixes every generated ``@id``.
	        ns: Namespace IRI used for both ``@base`` and ``@vocab``.
	        unique_id: Suffix appended to the root object's ``@id`` so roots
	            from separate calls do not collide. Defaults to no suffix.
	        properties: Passed through to ``json2linked_r``.
	        objects: Passed through to ``json2linked_r``.
	        simple_ids: If true, replace the ``_obj<suffix>`` part of each
	            individual's ``@id`` with a short form: the bare prefix when it
	            is the only individual with that prefix, otherwise
	            ``<prefix>_<n>`` with ``n`` counted from 1 in order of appearance.

	    Returns:
	        dict with an ``@context`` and an ``@graph`` (list of node objects).
	    """
	    # Block structure restored: this copy had lost its indentation.
	    # The previous default (``unique_id=unique_id``) referenced an undefined
	    # name, and the value was never forwarded to the recursive builder.
	    data = json2linked_r(j, name, unique_id, properties=properties, objects=objects)

	    # add the data to a very basic context
	    linked = {
	        "@context":
	            {
	              "@base": ns,
	              "@vocab": ns,
	              "owl": "http://www.w3.org/2002/07/owl#",
	              "xsd": "http://www.w3.org/2001/XMLSchema#",
	              "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
	              "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
	            },
	        "@graph": data}

	    # We use UUIDs to identify unique objects in the initial pass.
	    # Next we replace those with sequenced numbers.
	    if simple_ids:
	        # Group the ids of all individuals by their prefix; the position in
	        # each group becomes the individual's number.
	        # rsplit (not split) so a key that itself contains "_obj" keeps its
	        # full name: only the trailing "_obj<suffix>" marker is removed.
	        obj_map = defaultdict(list)
	        for obj in linked['@graph']:
	            if "owl:NamedIndividual" in obj["@type"]:
	                obj_map[obj["@id"].rsplit("_obj", 1)[0]].append(obj["@id"])

	        # Precompute old id -> legible id once instead of calling list.index
	        # for every reference. setdefault keeps the first position when the
	        # same id occurs more than once, matching list.index.
	        renamed = {}
	        for short_id, ids in obj_map.items():
	            for position, old_id in enumerate(ids, start=1):
	                new_id = short_id if len(ids) <= 1 else short_id + "_" + str(position)
	                renamed.setdefault(old_id, new_id)

	        def rename(node):
	            # Rewrite a reference ({"@id": ...}) if it points at an individual.
	            if type(node) == dict and "@id" in node:
	                node["@id"] = renamed.get(node["@id"], node["@id"])

	        for obj in linked['@graph']:
	            obj['@id'] = renamed.get(obj['@id'], obj['@id'])
	            for value in obj.values():
	                if type(value) == list:
	                    for entry in value:
	                        rename(entry)
	                else:
	                    rename(value)

	    return linked

	def json2linked_r(d, name, unique_id="", properties=False, objects=False):
	    """Recursively convert parsed JSON into a flat list of JSON-LD node objects.

	    Each dict becomes an ``owl:NamedIndividual`` whose ``@id`` is
	    ``<name>_obj<unique_id>``; each of its keys becomes a ``<name>.<key>``
	    entry. Scalars (bool, int, float, str) are emitted as typed ``@value``
	    literals. Dicts nested directly or inside lists become separate nodes
	    referenced by ``@id`` and get a random 8-character suffix. ``None``
	    values and empty lists are skipped.

	    Args:
	        d: Parsed JSON value: a dict, or a list/tuple of dicts.
	        name: Dotted path of ``d``; prefixes the ids generated for it.
	        unique_id: Suffix appended to the ``@id`` of the node built for ``d``.
	        properties: If true, also emit one node per key typed as
	            ``owl:DatatypeProperty`` and/or ``owl:ObjectProperty`` with a
	            matching ``rdfs:range``.
	        objects: If true, type each individual with its quoted path instead
	            of ``owl:Thing`` and make sure a node typed ``owl:Thing`` exists
	            for every key.

	    Returns:
	        list of node dicts; nodes of nested dicts precede their parent.

	    Raises:
	        Exception: anything raised while converting is logged together with
	            the offending value and re-raised unchanged.
	    """
	    # Block structure restored: this copy had lost its indentation.
	    type_map = {
	        int: "xsd:integer",
	        float: "xsd:float",
	        str: "xsd:string",
	        bool: "xsd:boolean"
	    }
	    scalar_types = (bool, int, float, str)

	    def unique(values):
	        # Order-preserving de-duplication: unlike list(set(...)) the result
	        # does not change between interpreter runs.
	        return list(dict.fromkeys(values))

	    ret = []
	    try:
	        if type(d) == dict:
	            # Create object for dict
	            obj = {
	                "@id": quote(f"{name}_obj{unique_id}"),
	                "@type": ["owl:NamedIndividual", quote(name) if objects else "owl:Thing"]
	            }
	            for k, v in d.items():
	                child_name = f"{name}.{k}"
	                prop_id = quote(child_name)
	                prop_types = []
	                prop_ranges = []

	                if type(v) in scalar_types:
	                    prop_types.append("owl:DatatypeProperty")
	                    prop_ranges.append("rdfs:Literal")
	                    obj[prop_id] = [{"@value": v, "@type": type_map[type(v)]}]
	                elif type(v) == dict:
	                    prop_types.append("owl:ObjectProperty")
	                    prop_ranges.append("owl:Thing")
	                    u = "_" + str(uuid4())[:8]
	                    obj[prop_id] = [{"@id": quote(f"{child_name}_obj{u}")}]
	                    ret += json2linked_r(v, child_name, u, properties=properties, objects=objects)
	                elif type(v) in (list, tuple) and len(v) > 0:
	                    values = []
	                    obj[prop_id] = values
	                    # flatten() removes nested lists/tuples, so only scalars,
	                    # dicts and None can show up here.
	                    for item in flatten(v):
	                        if type(item) in scalar_types:
	                            prop_types.append("owl:DatatypeProperty")
	                            prop_ranges.append("rdfs:Literal")
	                            values.append({"@value": item, "@type": type_map[type(item)]})
	                        elif type(item) == dict:
	                            prop_types.append("owl:ObjectProperty")
	                            prop_ranges.append("owl:Thing")
	                            u = "_" + str(uuid4())[:8]
	                            values.append({"@id": quote(f"{child_name}_obj{u}")})
	                            ret += json2linked_r(item, child_name, u, properties=properties, objects=objects)
	                # anything else (None, empty list, ...) is skipped

	                if properties:
	                    obj_prop = {
	                        "@id": prop_id,
	                        "@type": unique(prop_types),
	                        "rdfs:range": [{"@id": rng} for rng in unique(prop_ranges)]
	                    }
	                    if len(obj_prop['@type']) > 1:
	                        logging.warning("object property {0}, ({1}) has @type longer than 1. That may cause problems.".format(obj_prop["@id"], obj_prop['@type']))
	                    ret.append(obj_prop)

	                # Add in objects (or add type to property objects)
	                if objects:
	                    existing = next((node for node in ret if node['@id'] == prop_id), None)
	                    if existing is None:
	                        ret.append({
	                            '@id': prop_id,
	                            '@type': ["owl:Thing"]
	                        })
	                    else:
	                        existing['@type'] = unique(existing['@type'] + ["owl:Thing"])

	            ret.append(obj)
	        elif type(d) in scalar_types:
	            logging.warning("Shouldn't get here.")
	        elif type(d) in (list, tuple) and len(d) > 0:
	            # ``d`` is deliberately not rebound to the flatten() generator so
	            # the warnings below show the actual data.
	            for item in flatten(d):
	                if type(item) is dict and len(item) > 0:
	                    # Each dict in a root-level list needs its own suffix;
	                    # recursing with "" gave every item the same @id.
	                    item_id = unique_id + "_" + str(uuid4())[:8]
	                    ret += json2linked_r(item, name, item_id, properties=properties, objects=objects)
	                elif type(item) in (list, tuple):
	                    logging.warning("Flatten should prevent you getting a list of lists. name: {0}, d: {1}".format(name, d))
	                elif type(item) in scalar_types:
	                    logging.warning("I don't think we should get a list of values. name: {0}, d: {1}".format(name, d))
	                # None items are skipped
	        # any other type (e.g. None, empty list) yields nothing

	    except Exception:
	        # Record which value failed at this recursion level, then propagate.
	        logging.error("d: {0}, name: {1}".format(d, name))
	        raise

	    return ret

	# Build JSONLD Graph
	# Block structure restored: this copy had lost its indentation.
	# NOTE(review): `records_dict` is not defined in this file; presumably a
	# mapping of record name -> JSON object string provided by the caller — confirm.
	graph = {"@context": {}, "@graph": []}
	# Pre-bind the loop temporaries so the cleanup below cannot raise NameError
	# when records_dict is empty.
	j = jld = None
	# we query the records separately so that the property paths are the same for each record
	for name, record in records_dict.items():
	    u = "_" + str(uuid4())[:8]
	    j = json.loads(record)
	    j['record'] = name
	    # Set a unique_id so root records are unique.
	    # We set objects = True so we can identify the root by its object type.
	    # We set simple_ids = False to prevent collisions in IDs between the separately queried records.
	    # We _could_ set properties = True so that we could query paths of subProperties of the top properties.
	    jld = json2linked(j, "root", "https://example.org/ns#", unique_id=u, objects=True, simple_ids=False)
	    graph['@context'] = jld['@context']
	    graph['@graph'] += jld['@graph']
	del j
	del jld

	# Because we joined records with the same structure, objects and properties will be duplicated.
	# Here we delete duplicates, keeping the first node seen for each @id.
	objs = set()
	graph2 = list()
	# The original iterated over an undefined name (`victor`); the nodes live in `graph`.
	for obj in graph['@graph']:
	    if obj['@id'] not in objs:
	        objs.add(obj['@id'])
	        graph2.append(obj)
	graph['@graph'] = graph2
	del graph2