# msalvadores/rdflib_rdflists_access.py

## rdflib_rdflists_access.py
"""

Sample code to traverse RDF list with RDFLIB.
author: Manuel Salvadores (msalvadores@gmail.com)

RDF containers are a pain in general; they are quite annoying to handle.

To get all the authors for a given article like in your case you could do
something like the code I am posting below.

I have added comments so that it is self-explanatory. The most important bit
is the use of `g.triples(triple_pattern)`: with this graph method you can
filter an rdflib Graph and search for the triple patterns you need.

When an rdf:Seq is parsed, predicates of the form:

`http://www.w3.org/1999/02/22-rdf-syntax-ns#_1`
`http://www.w3.org/1999/02/22-rdf-syntax-ns#_2`
`http://www.w3.org/1999/02/22-rdf-syntax-ns#_3`

are created. rdflib returns them in no particular order, so you need to sort
them to traverse the sequence in the right order.
"""

import rdflib

#Shortcut to the rdf: namespace (rdf:type, rdf:_1, rdf:_2, ...)
RDF = rdflib.namespace.RDF

#Parse the file into an in-memory graph
g = rdflib.Graph()
g.parse("zot.rdf")

#So that we are sure we get something back.
# Single-argument call form: it prints the same text with the Python 2 print
# statement and with the Python 3 print function
print("Number of triples %d" % len(g))

#Couple of handy namespaces to use later
BIB = rdflib.Namespace("http://purl.org/net/biblio#")
FOAF = rdflib.Namespace("http://xmlns.com/foaf/0.1/")

#Running author counter, printed next to each author
i = 0

#Article for which we want the list of authors
article = rdflib.term.URIRef("http://www.ncbi.nlm.nih.gov/pubmed/18273724")

#Membership predicates of an rdf:Seq have the form rdf:_N, that is this prefix
# followed by the position N of the element in the sequence
seq_prefix = "http://www.w3.org/1999/02/22-rdf-syntax-ns#_"

#Printed in place of a surname or given name that the graph does not provide
missing = "(unknown)"

#The outer loop is equivalent to "get all authors for article x": the object of
# each bib:authors triple is the node that holds the sequence of authors
for triple in g.triples((article, BIB["authors"], None)):

    #Keep only the rdf:_N membership triples of the sequence node. This drops
    # rdf:type and also any other predicate, whose suffix could not be turned
    # into a number by the sort below
    list_triples = [t for t in g.triples((triple[2], None, None))
                    if t[1].startswith(seq_prefix)
                    and t[1][len(seq_prefix):].isdigit()]

    #We sort the authors by the predicate of the triple - order in sequences
    # does matter ;-) For rdf:_435 the text after the prefix is "435", and
    # since we want numeric order we compare int("435") - (BTW t[1] is the
    # predicate)
    authors_sorted = sorted(list_triples,
                            key=lambda t: int(t[1][len(seq_prefix):]))

    #We iterate the author bnodes and we get surname and given name
    for author_bnode in authors_sorted:
        #Both values are looked up afresh for every author, so a node without
        # one of them prints the placeholder instead of raising a NameError
        # (first author) or repeating the previous author's value.
        # If there are several values the last one returned wins.
        surnames = [o for s, p, o in
                    g.triples((author_bnode[2], FOAF['surname'], None))]
        givennames = [o for s, p, o in
                      g.triples((author_bnode[2], FOAF['givenname'], None))]
        author_surname = surnames[-1] if surnames else missing
        author_name = givennames[-1] if givennames else missing
        print("author(%s): %s %s" % (i, author_name, author_surname))
        i += 1

print("Query")

#NOTE(review): these registrations point rdflib at the SPARQL implementation
# of the separate rdfextras package, which therefore has to be installed.
# Presumably unnecessary on rdflib releases that bundle their own SPARQL
# engine - confirm against the rdflib version in use.
rdflib.plugin.register('sparql', rdflib.query.Processor,
                       'rdfextras.sparql.processor', 'Processor')
rdflib.plugin.register('sparql', rdflib.query.Result,
                       'rdfextras.sparql.query', 'SPARQLQueryResult')

#?seq_index is bound to the predicate that links the sequence node to each
# author node, i.e. rdf:_1, rdf:_2, ...
query = """
SELECT ?seq_index ?name ?surname WHERE {
     <http://www.ncbi.nlm.nih.gov/pubmed/18273724> bib:authors ?seq .
     ?seq ?seq_index ?seq_bnode .
     ?seq_bnode foaf:givenname ?name .
     ?seq_bnode foaf:surname ?surname .
}
"""

#Membership predicates of an rdf:Seq have the form rdf:_N, that is this prefix
# followed by the position N of the element in the sequence
seq_prefix = "http://www.w3.org/1999/02/22-rdf-syntax-ns#_"

#The query pattern accepts any predicate for ?seq_index, so keep only the rows
# whose predicate really is rdf:_N before converting the suffix to a number
rows = [row for row in g.query(query, initNs=dict(rdf=RDF, foaf=FOAF, bib=BIB))
        if row[0].startswith(seq_prefix)
        and row[0][len(seq_prefix):].isdigit()]

#Results come back in no particular order: sort numerically on the position
for row in sorted(rows, key=lambda r: int(r[0][len(seq_prefix):])):
    print("Author(%s) %s %s" % (row[0][len(seq_prefix):], row[1], row[2]))
	"""

	Sample code to traverse RDF list with RDFLIB.
	author: Manuel Salvadores (msalvadores@gmail.com)

	RDF containers are a pain in general; they are quite annoying to handle.

	To get all the authors for a given article like in your case you could do
	something like the code I am posting below.

	I have added comments so that it is self-explanatory. The most important bit
	is the use of `g.triples(triple_pattern)`: with this graph method you can
	filter an rdflib Graph and search for the triple patterns you need.

	When an rdf:Seq is parsed, predicates of the form:

	`http://www.w3.org/1999/02/22-rdf-syntax-ns#_1`
	`http://www.w3.org/1999/02/22-rdf-syntax-ns#_2`
	`http://www.w3.org/1999/02/22-rdf-syntax-ns#_3`

	are created. rdflib returns them in no particular order, so you need to sort
	them to traverse the sequence in the right order.
	"""

	import rdflib

	#Shortcut to the rdf: namespace (rdf:type, rdf:_1, rdf:_2, ...)
	RDF = rdflib.namespace.RDF

	#Parse the file into an in-memory graph
	g = rdflib.Graph()
	g.parse("zot.rdf")

	#So that we are sure we get something back.
	# Single-argument call form: it prints the same text with the Python 2 print
	# statement and with the Python 3 print function
	print("Number of triples %d" % len(g))

	#Couple of handy namespaces to use later
	BIB = rdflib.Namespace("http://purl.org/net/biblio#")
	FOAF = rdflib.Namespace("http://xmlns.com/foaf/0.1/")

	#Running author counter, printed next to each author
	i = 0

	#Article for which we want the list of authors
	article = rdflib.term.URIRef("http://www.ncbi.nlm.nih.gov/pubmed/18273724")

	#Membership predicates of an rdf:Seq have the form rdf:_N, that is this prefix
	# followed by the position N of the element in the sequence
	seq_prefix = "http://www.w3.org/1999/02/22-rdf-syntax-ns#_"

	#Printed in place of a surname or given name that the graph does not provide
	missing = "(unknown)"

	#The outer loop is equivalent to "get all authors for article x": the object of
	# each bib:authors triple is the node that holds the sequence of authors
	for triple in g.triples((article, BIB["authors"], None)):

		#Keep only the rdf:_N membership triples of the sequence node. This drops
		# rdf:type and also any other predicate, whose suffix could not be turned
		# into a number by the sort below
		list_triples = [t for t in g.triples((triple[2], None, None))
			if t[1].startswith(seq_prefix)
			and t[1][len(seq_prefix):].isdigit()]

		#We sort the authors by the predicate of the triple - order in sequences
		# does matter ;-) For rdf:_435 the text after the prefix is "435", and
		# since we want numeric order we compare int("435") - (BTW t[1] is the
		# predicate)
		authors_sorted = sorted(list_triples,
			key=lambda t: int(t[1][len(seq_prefix):]))

		#We iterate the author bnodes and we get surname and given name
		for author_bnode in authors_sorted:
			#Both values are looked up afresh for every author, so a node without
			# one of them prints the placeholder instead of raising a NameError
			# (first author) or repeating the previous author's value.
			# If there are several values the last one returned wins.
			surnames = [o for s, p, o in
				g.triples((author_bnode[2], FOAF['surname'], None))]
			givennames = [o for s, p, o in
				g.triples((author_bnode[2], FOAF['givenname'], None))]
			author_surname = surnames[-1] if surnames else missing
			author_name = givennames[-1] if givennames else missing
			print("author(%s): %s %s" % (i, author_name, author_surname))
			i += 1

	print("Query")

	#NOTE(review): these registrations point rdflib at the SPARQL implementation
	# of the separate rdfextras package, which therefore has to be installed.
	# Presumably unnecessary on rdflib releases that bundle their own SPARQL
	# engine - confirm against the rdflib version in use.
	rdflib.plugin.register('sparql', rdflib.query.Processor,
		'rdfextras.sparql.processor', 'Processor')
	rdflib.plugin.register('sparql', rdflib.query.Result,
		'rdfextras.sparql.query', 'SPARQLQueryResult')

	#?seq_index is bound to the predicate that links the sequence node to each
	# author node, i.e. rdf:_1, rdf:_2, ...
	query = """
	SELECT ?seq_index ?name ?surname WHERE {
	<http://www.ncbi.nlm.nih.gov/pubmed/18273724> bib:authors ?seq .
	?seq ?seq_index ?seq_bnode .
	?seq_bnode foaf:givenname ?name .
	?seq_bnode foaf:surname ?surname .
	}
	"""

	#Membership predicates of an rdf:Seq have the form rdf:_N, that is this prefix
	# followed by the position N of the element in the sequence
	seq_prefix = "http://www.w3.org/1999/02/22-rdf-syntax-ns#_"

	#The query pattern accepts any predicate for ?seq_index, so keep only the rows
	# whose predicate really is rdf:_N before converting the suffix to a number
	rows = [row for row in g.query(query, initNs=dict(rdf=RDF, foaf=FOAF, bib=BIB))
		if row[0].startswith(seq_prefix)
		and row[0][len(seq_prefix):].isdigit()]

	#Results come back in no particular order: sort numerically on the position
	for row in sorted(rows, key=lambda r: int(r[0][len(seq_prefix):])):
		print("Author(%s) %s %s" % (row[0][len(seq_prefix):], row[1], row[2]))