"""Save some of the connected pages (subjects) for a dbpedia.org resource as a tree in a JSON file."""
# Requirements: click==5.1, SPARQLWrapper==1.6.4
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import namedtuple
import json
import random
# Lightweight record for one related resource: its URI (`id`) and its label.
Obj = namedtuple('Obj', ['id', 'label'])
class Node(object):
    """A tree node holding a resource id, a display label, and child nodes."""

    def __init__(self, id, label):
        self.id = id
        self.label = label
        self.children = []

    def to_json(self):
        """Serialize this subtree into nested dicts; leaves get children=None."""
        child_dicts = None
        if self.children:
            child_dicts = [child.to_json() for child in self.children]
        return {'id': self.id, 'name': self.label, 'children': child_dicts}
class TreeFetch(object):
    """Recursively crawls DBpedia, building a tree of related resources.

    Expansion stops when `counter_limit` nodes have been expanded or a
    branch reaches its (randomized) depth limit.
    """

    def __init__(self, counter_limit=50, depth=5):
        # Resources already expanded — prevents cycles and duplicate subtrees.
        self.visited = {}
        # Number of nodes expanded so far, compared against COUNTER_LIMIT.
        self.counter = 0
        self.COUNTER_LIMIT = counter_limit
        # Overall maximum depth of the generated tree.
        self.depth_limit = depth

    def get_related(self, subject):
        """Query DBpedia for resources linked to/from `subject`.

        `subject` must be an already-bracketed URI (e.g. "<http://...>")
        so it can be substituted directly into the SPARQL query.
        Returns a list of Obj(id, label), at most 15 entries.
        """
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbr: <http://dbpedia.org/resource/>
        SELECT * {
         {
          SELECT (?obj as ?name) ?label
          WHERE {
            %s ?predicate ?obj.
            FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
            ?obj rdfs:label ?label.
            FILTER( lang(?label) = "en" || lang(?label) = "" ) .
          }
         }
         UNION
         {
          SELECT (?subject as ?name) ?label
          WHERE {
            ?subject ?predicate %s.
            FILTER(?predicate in (rdfs:seeAlso, dbo:knownFor, rdfs:isDefinedBy, dbo:ideology, dbo:nonFictionSubject) ) .
            ?subject rdfs:label ?label.
            FILTER( lang(?label) = "en" || lang(?label) = "" ) .
          }
         }
        }
        LIMIT 15
        """ % (subject, subject))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        return [Obj(r['name']['value'], r['label']['value'])
                for r in results["results"]["bindings"]]

    def select_depth(self, depth_limit):
        """Return `depth_limit` if set; otherwise pick a random limit per branch.

        Randomizing the per-branch max depth makes the resulting graph more
        interesting.  Uses `//` so the result stays an int on both Python 2
        and 3 (`random.randrange` rejects floats).
        """
        if not depth_limit:
            depth_limit = random.randrange(self.depth_limit // 2, self.depth_limit) + 1
            # Parenthesized print keeps this line valid on Python 2 and 3,
            # matching the call style used in collect_all().
            print('selected depth_limit={}'.format(depth_limit))
        return depth_limit

    def collect_all(self, subject, label, depth=1, depth_limit=None):
        """Recursively build and return the Node tree rooted at `subject`.

        Returns None when `subject` was already visited or the depth limit
        is reached, so callers can skip empty branches.
        """
        if subject in self.visited:
            return
        max_depth = depth_limit or self.depth_limit
        if depth >= max_depth:
            return
        self.visited[subject] = True
        node = Node(subject, label)
        related = self.get_related(subject)
        for item in related:
            if depth + 1 >= max_depth:
                break
            # Each child branch gets its own (possibly randomized) depth cap.
            branch_depth = self.select_depth(depth_limit)
            item_children = self.collect_all('<{}>'.format(item.id), item.label, (depth + 1), branch_depth)
            if item_children:
                node.children.append(item_children)
            self.counter += 1
            if self.counter > self.COUNTER_LIMIT:
                break
        print('collect_all={}, depth={}, counter={}'.format(subject, depth, self.counter))
        return node
import click
import sys
@click.command()
@click.option('--limit', default=100, help='Number of nodes to generate')
@click.option('--max-depth', default=5, help='max depth of the tree')
@click.option('--file_path', default=None, help='file to save to.')
@click.argument('resource')
def main(limit, max_depth, file_path, resource):
    """Fetch a tree of pages related to RESOURCE from DBpedia and dump it as JSON."""
    # Seed with the resource name so repeated runs produce the same tree.
    random.seed(resource)
    fetch = TreeFetch(counter_limit=limit, depth=max_depth)
    res = fetch.collect_all("<http://dbpedia.org/resource/{}>".format(resource), resource)
    payload = json.dumps(res.to_json())
    if file_path:
        # Text mode: json.dumps returns str, which a 'wb' handle would reject
        # on Python 3; `with` also guarantees the file is closed.
        with open(file_path, 'w') as f:
            f.write(payload)
    else:
        sys.stdout.write(payload)


if __name__ == "__main__":
    main()