Skip to content

Instantly share code, notes, and snippets.

@romanlv
Last active October 7, 2015 04:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save romanlv/ffac9b420ef817052820 to your computer and use it in GitHub Desktop.
Save romanlv/ffac9b420ef817052820 to your computer and use it in GitHub Desktop.
Script that saves some of the pages (subjects) connected to a dbpedia.org resource as a tree in a JSON file.
python resource_tree.py --limit=100 --file_path=data.json Internet
click==5.1
SPARQLWrapper==1.6.4
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import namedtuple
import json
import random
# Lightweight (id, label) record for one related resource; built from the
# 'name' and 'label' bindings of each SPARQL result row in
# TreeFetch.get_related.
Obj = namedtuple('Obj', 'id,label')
class Node(object):
    """One node of the resource tree.

    Attributes:
        id: Identifier of the resource this node stands for.
        label: Human-readable label of the resource.
        children: List of child ``Node`` objects; starts empty.
    """

    def __init__(self, id, label):
        """Create a node with no children."""
        self.id = id
        self.label = label
        self.children = []

    def to_json(self):
        """Return the subtree rooted here as a JSON-serializable dict.

        The dict has the keys ``'id'``, ``'name'`` (the label) and
        ``'children'``. ``'children'`` is a list of nested dicts, or
        ``None`` when the node has no children.
        """
        if self.children:
            children = [child.to_json() for child in self.children]
        else:
            # Leaves serialize as null rather than as an empty list.
            children = None
        return {
            'id': self.id,
            'name': self.label,
            'children': children,
        }
class TreeFetch(object):
    """Collect resources related to a subject into a tree of ``Node`` objects.

    Related resources are looked up with SPARQL queries against
    ``http://dbpedia.org/sparql`` and followed recursively.

    Attributes:
        visited: Dict whose keys are the subjects already expanded; used to
            avoid expanding the same subject twice.
        counter: Number of child nodes attached to the tree so far.
        COUNTER_LIMIT: Once ``counter`` exceeds this value, the loops that
            attach children stop.
        depth_limit: Default maximum depth; also the upper bound for the
            randomly chosen per-branch depth.
    """

    def __init__(self, counter_limit=50, depth=5):
        """Store the limits and reset the traversal state.

        Args:
            counter_limit: Value for ``COUNTER_LIMIT``.
            depth: Value for ``depth_limit``.
        """
        self.visited = {}
        self.counter = 0
        self.COUNTER_LIMIT = counter_limit
        self.depth_limit = depth

    def get_related(self, subject):
        """Query DBpedia for resources linked to or from ``subject``.

        Only links through rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy,
        dbo:ideology and dbo:nonFictionSubject are considered, in either
        direction, and only labels that are English or have no language
        tag. The query is capped at 15 rows.

        Args:
            subject: SPARQL term for the resource, e.g.
                ``'<http://dbpedia.org/resource/Internet>'``.

        Returns:
            A list of ``Obj(id, label)`` tuples, one per result row.
        """
        # NOTE(review): ``subject`` is interpolated into the query text
        # unescaped, so it must already be a well-formed SPARQL term.
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT * {
{
SELECT (?obj as ?name) ?label
WHERE {
%s ?predicate ?obj.
FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
?obj rdfs:label ?label.
FILTER( lang(?label) = "en" || lang(?label) = "" ) .
}
}
UNION
{
SELECT (?subject as ?name) ?label
WHERE {
?subject ?predicate %s.
FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
?subject rdfs:label ?label.
FILTER( lang(?label) = "en" || lang(?label) = "" ) .
}
}
}
LIMIT 15
""" % (subject, subject))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        return [
            Obj(row['name']['value'], row['label']['value'])
            for row in results["results"]["bindings"]
        ]

    def select_depth(self, depth_limit):
        """Return ``depth_limit``, choosing a random one if none is set.

        Randomly selecting the maximum depth for each branch makes the
        resulting graph more interesting.

        Args:
            depth_limit: Depth limit already chosen for the branch, or a
                falsy value (``None``/``0``) to pick one now.

        Returns:
            ``depth_limit`` unchanged when truthy; otherwise a random
            integer in ``[self.depth_limit // 2 + 1, self.depth_limit]``.
        """
        if not depth_limit:
            # Floor division keeps the bound an integer on Python 3 too;
            # randrange does not accept a float there.
            depth_limit = random.randrange(
                self.depth_limit // 2, self.depth_limit) + 1
            print('selected depth_limit={}'.format(depth_limit))
        return depth_limit

    def collect_all(self, subject, label, depth=1, depth_limit=None):
        """Recursively build the tree of resources related to ``subject``.

        Args:
            subject: SPARQL term of the resource to expand.
            label: Label stored on the resulting node.
            depth: Depth of this node; the root is at depth 1.
            depth_limit: Depth limit for this branch; when falsy,
                ``self.depth_limit`` applies and each child branch gets
                its own randomly selected limit.

        Returns:
            The ``Node`` for ``subject`` with the children that were
            collected, or ``None`` if the subject was already visited or
            ``depth`` has reached the limit.
        """
        if subject in self.visited:
            return None

        max_depth = depth_limit or self.depth_limit
        if depth >= max_depth:
            return None

        self.visited[subject] = True
        node = Node(subject, label)

        # Children would sit at depth + 1. When that already reaches the
        # limit none of them can be attached, so skip the network query.
        if depth + 1 < max_depth:
            for item in self.get_related(subject):
                branch_depth = self.select_depth(depth_limit)
                child = self.collect_all(
                    '<{}>'.format(item.id), item.label,
                    depth + 1, branch_depth)
                if child is not None:
                    node.children.append(child)
                    self.counter += 1
                # Checked on every iteration so that, once the limit is
                # exceeded, every enclosing loop stops as well.
                if self.counter > self.COUNTER_LIMIT:
                    break

        print('collect_all={}, depth={}, counter={}'.format(subject, depth, self.counter))
        return node
import click
import sys
@click.command()
@click.option('--limit', default=100, help='Number of nodes to generate')
@click.option('--max-depth', default=5, help='max depth of the tree')
@click.option('--file_path', default=None, help='file to save to.')
@click.argument('resource')
def main(limit, max_depth, file_path, resource):
    """Fetch the tree of resources related to RESOURCE and dump it as JSON.

    Args:
        limit: Passed to ``TreeFetch`` as ``counter_limit``.
        max_depth: Passed to ``TreeFetch`` as ``depth``.
        file_path: File to write the JSON to; when omitted the JSON is
            written to standard output.
        resource: Name appended to ``http://dbpedia.org/resource/`` to
            form the root subject; also used as the root label.

    Raises:
        click.ClickException: If no tree could be built, i.e.
            ``collect_all`` returned ``None`` for the root.
    """
    # Seeding with the resource name makes the randomly selected branch
    # depths repeatable for the same resource.
    random.seed(resource)
    fetch = TreeFetch(counter_limit=limit, depth=max_depth)
    res = fetch.collect_all(
        "<http://dbpedia.org/resource/{}>".format(resource), resource)
    if res is None:
        # collect_all returns None when the root is already at the depth
        # limit; report that instead of failing on None.to_json().
        raise click.ClickException(
            'no tree could be built for {!r} with --max-depth={}'.format(
                resource, max_depth))

    payload = json.dumps(res.to_json())
    if file_path:
        # Text mode: json.dumps returns str, which a binary-mode file
        # rejects on Python 3. The context manager closes the file.
        with open(file_path, 'w') as out:
            out.write(payload)
    else:
        sys.stdout.write(payload)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment