"""Save some of the connected pages (subjects) for a dbpedia.org resource as a tree in a JSON file."""
# Requirements: click==5.1, SPARQLWrapper==1.6.4
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import namedtuple
import json
import random
# Lightweight record for one related resource: its URI (`id`) and its label.
Obj = namedtuple('Obj', ['id', 'label'])
class Node(object):
    """A tree node holding a resource id, a display label, and child nodes."""

    def __init__(self, id, label):
        self.id = id
        self.label = label
        self.children = []

    def to_json(self):
        """Serialize this subtree into nested dicts; leaves get children=None."""
        child_dicts = None
        if self.children:
            child_dicts = [child.to_json() for child in self.children]
        return {'id': self.id, 'name': self.label, 'children': child_dicts}
class TreeFetch(object):
    """Recursively crawls DBpedia, building a tree of related resources.

    Expansion stops when `counter_limit` nodes have been expanded or a
    branch reaches its (randomized) depth limit.
    """

    def __init__(self, counter_limit=50, depth=5):
        # Resources already expanded — prevents cycles and duplicate subtrees.
        self.visited = {}
        # Number of nodes expanded so far, compared against COUNTER_LIMIT.
        self.counter = 0
        self.COUNTER_LIMIT = counter_limit
        # Overall maximum depth of the generated tree.
        self.depth_limit = depth

    def get_related(self, subject):
        """Query DBpedia for resources linked to/from `subject`.

        `subject` must be an already-bracketed URI (e.g. "<http://...>")
        so it can be substituted directly into the SPARQL query.
        Returns a list of Obj(id, label), at most 15 entries.
        """
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbr: <http://dbpedia.org/resource/>
        SELECT * {
         {
          SELECT (?obj as ?name) ?label
          WHERE {
            %s ?predicate ?obj.
            FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
            ?obj rdfs:label ?label.
            FILTER( lang(?label) = "en" || lang(?label) = "" ) .
          }
         }
         UNION
         {
          SELECT (?subject as ?name) ?label
          WHERE {
            ?subject ?predicate %s.
            FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
            ?subject rdfs:label ?label.
            FILTER( lang(?label) = "en" || lang(?label) = "" ) .
          }
         }
        }
        LIMIT 15
        """ % (subject, subject))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        return [Obj(r['name']['value'], r['label']['value'])
                for r in results["results"]["bindings"]]

    def select_depth(self, depth_limit):
        """Return `depth_limit` if set; otherwise pick a random limit per branch.

        Randomizing the per-branch max depth makes the resulting graph more
        interesting.  Uses `//` so the result stays an int on both Python 2
        and 3 (`random.randrange` rejects floats).
        """
        if not depth_limit:
            depth_limit = random.randrange(self.depth_limit // 2, self.depth_limit) + 1
            # Parenthesized print keeps this line valid on Python 2 and 3,
            # matching the call style used in collect_all().
            print('selected depth_limit={}'.format(depth_limit))
        return depth_limit

    def collect_all(self, subject, label, depth=1, depth_limit=None):
        """Recursively build and return the Node tree rooted at `subject`.

        Returns None when `subject` was already visited or the depth limit
        is reached, so callers can skip empty branches.
        """
        if subject in self.visited:
            return
        max_depth = depth_limit or self.depth_limit
        if depth >= max_depth:
            return
        self.visited[subject] = True
        node = Node(subject, label)
        related = self.get_related(subject)
        for item in related:
            if depth + 1 >= max_depth:
                break
            # Each child branch gets its own (possibly randomized) depth cap.
            branch_depth = self.select_depth(depth_limit)
            item_children = self.collect_all('<{}>'.format(item.id), item.label, (depth + 1), branch_depth)
            if item_children:
                node.children.append(item_children)
            self.counter += 1
            if self.counter > self.COUNTER_LIMIT:
                break
        print('collect_all={}, depth={}, counter={}'.format(subject, depth, self.counter))
        return node
import click
import sys
@click.command()
@click.option('--limit', default=100, help='Number of nodes to generate')
@click.option('--max-depth', default=5, help='max depth of the tree')
@click.option('--file_path', default=None, help='file to save to.')
@click.argument('resource')
def main(limit, max_depth, file_path, resource):
    """Fetch a tree of pages related to RESOURCE from DBpedia and dump it as JSON."""
    # Seed with the resource name so repeated runs produce the same tree.
    random.seed(resource)
    fetch = TreeFetch(counter_limit=limit, depth=max_depth)
    res = fetch.collect_all("<http://dbpedia.org/resource/{}>".format(resource), resource)
    payload = json.dumps(res.to_json())
    if file_path:
        # Text mode: json.dumps returns str, which a 'wb' handle would reject
        # on Python 3; `with` also guarantees the file is closed.
        with open(file_path, 'w') as f:
            f.write(payload)
    else:
        sys.stdout.write(payload)


if __name__ == "__main__":
    main()