Skip to content

Instantly share code, notes, and snippets.

@mommi84
Created September 27, 2017 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mommi84/0881459ec9ed81957d63ea51e9ccc057 to your computer and use it in GitHub Desktop.
Save mommi84/0881459ec9ed81957d63ea51e9ccc057 to your computer and use it in GitHub Desktop.
For each (indexed) LOD dataset
#!/usr/bin/env python
import sys
import urllib2, urllib, httplib, json
# Python 2 idiom: sys.setdefaultencoding is removed from the sys namespace at
# interpreter startup, so the module is reloaded to get it back, and the
# implicit str/unicode default codec is then switched to UTF-8.
# NOTE(review): neither call exists on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
# URL of the SPARQL endpoint that sparql_query sends its requests to.
ENDPOINT = "http://stats.lod2.eu/sparql"
# Value sent as the "default-graph-uri" request parameter (empty here;
# presumably this lets the endpoint pick its default graph - confirm).
GRAPH = ""
# Page size: the LIMIT value sparql_query_all appends to every paged query.
BUFFER = 10000
# IRIs, already in SPARQL angle-bracket form, spliced into the dataset query:
# the class matched by "?s a ..." and the predicate whose object is bound to ?d.
DATASET = '<http://lodstats.aksw.org/ontology/ldso.owl#Dataset>'
DL_URL = '<http://www.w3.org/ns/dcat#downloadURL>'
def sparql_query(query):
    """Run one SPARQL query against ENDPOINT and return the decoded JSON reply.

    The query is sent as an HTTP GET request whose query string carries the
    parameters assembled below.

    Args:
        query: SPARQL query text.

    Returns:
        The object produced by ``json.loads`` on the response body. If the
        request raises ``urllib2.HTTPError`` or ``httplib.BadStatusLine``, a
        message is printed and the decoded form of an empty result set
        (``{"results": {"bindings": []}}``) is returned instead.

    Raises:
        ValueError: if the response body is not valid JSON.
        Any other error raised by the request (for example
        ``urllib2.URLError`` on a connection failure) is not caught here and
        propagates to the caller.
    """
    param = {
        "default-graph-uri": GRAPH,
        "query": query,
        "format": "JSON",
        "CXML_redir_for_subjs": "121",
        "CXML_redir_for_hrefs": "",
        "timeout": "600000",  # ten minutes - works with Virtuoso endpoints
        "debug": "on",
    }
    url = ENDPOINT + "?" + urllib.urlencode(param)
    try:
        resp = urllib2.urlopen(url)
        try:
            j = resp.read()
        finally:
            # Release the connection even when reading the body fails.
            resp.close()
    except (urllib2.HTTPError, httplib.BadStatusLine):
        print("*** Query error. Empty result set. ***")
        # Stand-in body with the same shape as a real reply, so callers can
        # index ['results']['bindings'] without special-casing failures.
        j = '{ "results": { "bindings": [] } }'
    # Push out anything printed so far before returning.
    sys.stdout.flush()
    return json.loads(j)
def sparql_query_all(query):
    """Collect every result row of ``query`` by paging with OFFSET/LIMIT.

    ``" OFFSET <n> LIMIT <BUFFER>"`` is appended to ``query`` for each page,
    so ``query`` must not carry OFFSET or LIMIT clauses of its own.

    Paging stops at the first empty page, and the offset advances by the
    number of rows actually received rather than by BUFFER. A page shorter
    than BUFFER is therefore not taken to be the last one, which keeps the
    result complete when the endpoint caps the rows per response below
    BUFFER. The price is one extra request that returns the final empty page.

    Because sparql_query returns an empty result set on a query error, an
    error on any page also ends the paging, and the rows gathered so far are
    returned.

    Args:
        query: SPARQL SELECT query text without OFFSET/LIMIT.

    Returns:
        A list with the ``bindings`` entries of all pages, in fetch order.
    """
    res = []
    offset = 0
    while True:
        qry = "{} OFFSET {} LIMIT {}".format(query, offset, BUFFER)
        step = sparql_query(qry)['results']['bindings']
        if not step:
            break
        res.extend(step)
        # Continue right after the last row received, whatever the page size
        # the endpoint actually honoured.
        offset += len(step)
    return res
# Select every dataset resource (?s) together with its download URL (?d).
dataset_query = "SELECT ?s ?d WHERE {{ ?s a {} ; {} ?d }}".format(DATASET, DL_URL)
datasets = sparql_query_all(dataset_query)
print("{} datasets fetched.".format(len(datasets)))
# Placeholder loop: per-dataset processing goes here.
for d in datasets:
    # ...
    pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment