Created
November 10, 2016 17:46
-
-
Save danbri/ad684a50872fffb30e0bbd2c22ea3e18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Quick demo: use Selenium (a real browser driver) as a preprocessor to
# extract JSON-LD from web content. Because the page is rendered by a
# browser, JSON-LD injected by scripts is also captured — something a
# plain HTTP fetch would miss. The same approach could cover
# Microdata/RDFa.
#
# Dan Brickley <danbri@google.com>
from selenium import webdriver
from bs4 import BeautifulSoup
from rdflib import Graph, plugin
import json, rdflib_jsonld
from rdflib.plugin import register, Serializer
from SPARQLWrapper import SPARQLWrapper
import warnings

# Page to fetch; serves as the base IRI when parsing relative JSON-LD terms.
u = "http://danbri.org/2016/browserdemo/helloworld.html"

# Make rdflib aware of the JSON-LD serializer provided by rdflib_jsonld.
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

browser = webdriver.Firefox()
#browser = webdriver.Chrome() # TODO
browser.get(u)
# page_source is the browser-rendered DOM, including script-injected markup.
pagetext = browser.page_source

# Best-effort teardown: old Selenium/Firefox combos are noisy and flaky on
# shutdown, so warnings are suppressed and any failure is swallowed —
# we already have the page text we need.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    try:
        browser.close()
        browser.quit()
    except Exception as e:
        print("...")

soup = BeautifulSoup(pagetext, 'lxml')
print("Extracting script tags.")
for tag in soup.find_all('script'):
    # endswith (not ==) tolerates MIME-type prefixes/charset noise in @type.
    tt = str(tag.get('type', None))
    if tt.endswith("application/ld+json"):
        myJsonLd = tag.get_text()
        g = Graph()
        g.parse(data=myJsonLd, format='json-ld', base=u)
        # Dump all triples BEFORE closing the graph; the original closed
        # the graph first and then tried to iterate it.
        for s, p, o in g.triples((None, None, None)):
            print("%s %s %s" % (s, p, o))
        g.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment