Skip to content

Instantly share code, notes, and snippets.

@danbri
Created November 10, 2016 17:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danbri/ad684a50872fffb30e0bbd2c22ea3e18 to your computer and use it in GitHub Desktop.
Save danbri/ad684a50872fffb30e0bbd2c22ea3e18 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
from rdflib import Graph, plugin
import json, rdflib_jsonld
from rdflib.plugin import register, Serializer
from SPARQLWrapper import SPARQLWrapper
import warnings
u = "http://danbri.org/2016/browserdemo/helloworld.html"
# This is a quick example showing the use of Selenium (headless webdriver browser)
# as a preprocessor for extraction of JSON-LD from Web content, allowing JSON-LD
# to be injected by scripts into pages. We could do similar for Microdata/RDFa.
#
# Dan Brickley <danbri@google.com>

# Register the rdflib_jsonld serializer class under the 'json-ld' format name.
# NOTE(review): only a Serializer is registered here, while the rest of the
# script *parses* JSON-LD; presumably the matching parser plugin is registered
# by the installed rdflib_jsonld package itself -- confirm.
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

browser = webdriver.Firefox()
#browser = webdriver.Chrome() # TODO
try:
    # Load the page in a real browser so that scripts run, then capture the
    # resulting DOM serialisation (including any script-injected markup).
    browser.get(u)
    pagetext = browser.page_source
finally:
    # Always shut the browser down, even if loading the page failed, so no
    # browser/driver process is left behind.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Shutdown is best-effort: attempt each step independently so that a
        # failing close() cannot prevent quit() from ending the session.
        for shutdown_step in (browser.close, browser.quit):
            try:
                shutdown_step()
            except Exception:
                # Deliberately swallowed; a failed shutdown must not abort
                # the extraction that follows.
                print("...")
# Parse the browser-rendered markup and print every triple found in each
# embedded JSON-LD <script> block, one "subject predicate object" per line.
soup = BeautifulSoup(pagetext, 'lxml')
print("Extracting script tags.")
for tag in soup.find_all('script'):
    # A missing type attribute becomes "" (rather than the string "None"),
    # and avoiding str() keeps non-ASCII attribute values from raising on
    # Python 2. MIME types are case-insensitive, so normalise before matching.
    tt = (tag.get('type') or "").strip().lower()
    if not tt.endswith("application/ld+json"):
        continue
    myJsonLd = tag.get_text()
    if not myJsonLd.strip():
        # Nothing to parse in an empty script element.
        continue
    g = Graph()
    try:
        # The page URL is the base for resolving relative IRIs in the data.
        g.parse(data=myJsonLd, format='json-ld', base=u)
        for s, p, o in g:
            print("%s %s %s" % (s, p, o))
    finally:
        # Close the graph only after its triples have been read.
        g.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment