# adamnagel/rdflib-perf-testing.py

## rdflib-perf-testing.py
from timeit import default_timer as timer
from rdflib import Namespace, Graph, Literal, ConjunctiveGraph
from rdflib.namespace import RDF, RDFS, DCTERMS
import xml.dom.minidom
import json
import xml.etree.cElementTree as ET
import re
from rdflib.store import NO_STORE, VALID_STORE


def time_msg(msg, start):
    """Print the time elapsed since ``start`` followed by ``msg``.

    ``start`` is an earlier reading of ``timer()``. The difference to the
    current reading is multiplied by 1000, labelled "ms" and printed with
    four decimals, a tab, a pipe and then the message.
    """
    elapsed_ms = (timer() - start) * 1000
    report = '{:.4f} ms\t| {}'.format(elapsed_ms, msg)
    print(report)


# Time the creation of the in-memory graph.
start = timer()
mg = Graph()
time_msg('creating graph in mem', start)

# Restart the clock so the Sleepycat figure does not also include the
# in-memory graph creation and the report printed just above.
start = timer()
scg = ConjunctiveGraph(store='Sleepycat')
rt = scg.open('cat', create=False)

if rt == NO_STORE:
    # There is no underlying Sleepycat infrastructure, create it
    scg.open('cat', create=True)
else:
    assert rt == VALID_STORE, 'The underlying store is corrupt'
time_msg('creating graph in Sleepycat', start)

# Use the in-memory version
g = mg
# Use the Sleepycat version instead
# g = scg

def ResultDictViaXml(res):
    """Turn a query result into a list of plain dicts by way of XML.

    ``res.serialize()`` is called without arguments and its output is parsed
    with ``ET.fromstring``. Every ``result`` element in the SPARQL-results
    namespace yields one dict; each ``binding`` element below it contributes
    an entry keyed by its ``name`` attribute whose value is the text of the
    binding's first child element.
    """
    result_path = './/{http://www.w3.org/2005/sparql-results#}result'
    binding_path = './/{http://www.w3.org/2005/sparql-results#}binding'

    document = ET.fromstring(res.serialize())

    return [
        {
            bound.attrib['name']: bound.find('.*').text
            for bound in solution.findall(binding_path)
        }
        for solution in document.findall(result_path)
    ]

ns_temp = Namespace('https://rdf.someplace.com/test-data#')

# Every timed step below runs the same query, so it is spelled out once.
distinct_sp_query = """SELECT DISTINCT ?s ?p
   WHERE {
      ?s ?p ?o
   }"""

# Number of triples added per round; constant across rounds.
num_triples = 10000

# Three rounds. Each one adds num_triples triples under a fresh
# subject/predicate pair and then times several ways of consuming the
# results of the query above.
for round_no in range(3):
    uri_subject = ns_temp['subject{}'.format(round_no)]
    uri_predicate = ns_temp['verb{}'.format(round_no)]

    start = timer()
    for i in range(num_triples):
        uri_object = Literal(i)
        g.add((uri_subject, uri_predicate, uri_object))
    time_msg('add {:,d} triples individually {}'.format(num_triples, round_no), start)

    start = timer()
    qres = g.query(distinct_sp_query)
    time_msg('query', start)

    start = timer()
    for position, row in enumerate(qres):
        if position == 0:
            # Report as soon as the first row arrives. The message has no
            # placeholder, so nothing is formatted into it; in particular
            # len(qres) is not called while the iteration is in progress.
            time_msg('iterate first result', start)
    time_msg('iterate {} results'.format(len(qres)), start)

    # Remembered so later messages need not call len() on a fresh result.
    num_results = len(qres)

    qres = g.query(distinct_sp_query)
    start = timer()
    for row in qres:
        row.asdict()
    time_msg('iterate {} results (with .asdict())'.format(num_results), start)

    qres = g.query(distinct_sp_query)
    start = timer()
    new_res = ResultDictViaXml(qres)
    time_msg('results via XML', start)

    qres = g.query(distinct_sp_query)
    start = timer()
    qres.serialize(format='json')
    time_msg('results serialize JSON', start)

    qres = g.query(distinct_sp_query)
    start = timer()
    qres.serialize(format='csv')
    time_msg('results serialize CSV', start)

# Write the accumulated graph out as Turtle.
g.serialize('test.ttl', format='ttl')

start = timer()
	from timeit import default_timer as timer
	from rdflib import Namespace, Graph, Literal, ConjunctiveGraph
	from rdflib.namespace import RDF, RDFS, DCTERMS
	import xml.dom.minidom
	import json
	import xml.etree.cElementTree as ET
	import re
	from rdflib.store import NO_STORE, VALID_STORE


	def time_msg(msg, start):
	    """Print the time elapsed since ``start`` followed by ``msg``.

	    ``start`` is an earlier reading of ``timer()``. The difference to
	    the current reading is multiplied by 1000, labelled "ms" and printed
	    with four decimals, a tab, a pipe and then the message.
	    """
	    # The separator is a plain pipe; the backslash that preceded it was
	    # not a valid escape and would have been printed literally.
	    print('{:.4f} ms\t| {}'.format((timer() - start) * 1000, msg))


	# Time the creation of the in-memory graph.
	start = timer()
	mg = Graph()
	time_msg('creating graph in mem', start)

	# Restart the clock so the Sleepycat figure does not also include the
	# in-memory graph creation and the report printed just above.
	start = timer()
	scg = ConjunctiveGraph(store='Sleepycat')
	rt = scg.open('cat', create=False)

	if rt == NO_STORE:
	    # There is no underlying Sleepycat infrastructure, create it
	    scg.open('cat', create=True)
	else:
	    assert rt == VALID_STORE, 'The underlying store is corrupt'
	time_msg('creating graph in Sleepycat', start)

	# Use the in-memory version
	g = mg
	# Use the Sleepycat version instead
	# g = scg

	def ResultDictViaXml(res):
	    """Turn a query result into a list of plain dicts by way of XML.

	    ``res.serialize()`` is called without arguments and its output is
	    parsed with ``ET.fromstring``. Every ``result`` element in the
	    SPARQL-results namespace yields one dict; each ``binding`` element
	    below it contributes an entry keyed by its ``name`` attribute whose
	    value is the text of the binding's first child element.
	    """
	    xml_qres = res.serialize()
	    root = ET.fromstring(xml_qres)

	    new_res = []
	    for result in root.findall('.//{http://www.w3.org/2005/sparql-results#}result'):
	        d = {}
	        for binding in result.findall('.//{http://www.w3.org/2005/sparql-results#}binding'):
	            name = binding.attrib['name']
	            # First child of the binding element holds the bound value.
	            tag_value = binding.find('.*')
	            d[name] = tag_value.text

	        new_res.append(d)

	    return new_res

	ns_temp = Namespace('https://rdf.someplace.com/test-data#')

	# Every timed step below runs the same query, so it is spelled out once.
	distinct_sp_query = (
	    'SELECT DISTINCT ?s ?p '
	    'WHERE { ?s ?p ?o }'
	)

	# Number of triples added per round; constant across rounds.
	num_triples = 10000

	# Three rounds. Each one adds num_triples triples under a fresh
	# subject/predicate pair and then times several ways of consuming the
	# results of the query above.
	for round_no in range(3):
	    uri_subject = ns_temp['subject{}'.format(round_no)]
	    uri_predicate = ns_temp['verb{}'.format(round_no)]

	    start = timer()
	    for i in range(num_triples):
	        uri_object = Literal(i)
	        g.add((uri_subject, uri_predicate, uri_object))
	    time_msg('add {:,d} triples individually {}'.format(num_triples, round_no), start)

	    start = timer()
	    qres = g.query(distinct_sp_query)
	    time_msg('query', start)

	    start = timer()
	    for position, row in enumerate(qres):
	        if position == 0:
	            # Report as soon as the first row arrives. The message has
	            # no placeholder, so nothing is formatted into it; in
	            # particular len(qres) is not called mid-iteration.
	            time_msg('iterate first result', start)
	    time_msg('iterate {} results'.format(len(qres)), start)

	    # Remembered so later messages need not call len() on a fresh result.
	    num_results = len(qres)

	    qres = g.query(distinct_sp_query)
	    start = timer()
	    for row in qres:
	        row.asdict()
	    time_msg('iterate {} results (with .asdict())'.format(num_results), start)

	    qres = g.query(distinct_sp_query)
	    start = timer()
	    new_res = ResultDictViaXml(qres)
	    time_msg('results via XML', start)

	    qres = g.query(distinct_sp_query)
	    start = timer()
	    qres.serialize(format='json')
	    time_msg('results serialize JSON', start)

	    qres = g.query(distinct_sp_query)
	    start = timer()
	    qres.serialize(format='csv')
	    time_msg('results serialize CSV', start)

	# Write the accumulated graph out as Turtle.
	g.serialize('test.ttl', format='ttl')

	start = timer()