This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Crawl O'Reilly's Book Catalog, extract RDFa about the books, and stash away
the triples in an rdflib BerkeleyDB store.

You will need the trunk version of rdflib installed, or otherwise available.
You will also need html5lib for the lax, tag-soup parsing--O'Reilly's HTML
for its book pages isn't well-formed at the moment.
"""
import re | |
import urllib | |
from rdflib.graph import ConjunctiveGraph | |
from rdflib.term import URIRef | |
# Index pages of the O'Reilly catalog; each one is fetched and scanned for
# links to individual book pages.
catalog_urls = [
    "http://oreilly.com/store/complete.html",
    "http://oreilly.com/store/complete2.html",
    "http://oreilly.com/store/complete3.html",
    "http://oreilly.com/store/complete4.html",
]
# Open the persistent 'Sleepycat' store in ./store; create=True asks the
# store to be created.
graph = ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)

# Matches quoted book-page links in the catalog HTML. Compiled once, with the
# dots escaped so that only the literal host name matches.
_book_url_re = re.compile(r'"(http://oreilly\.com/catalog/\d+/)"')

# NOTE: urllib.urlopen is the Python 2 API this script was written against.
try:
    for catalog_url in catalog_urls:
        response = urllib.urlopen(catalog_url)
        try:
            html = response.read()
        finally:
            # Release the HTTP connection even if the read fails.
            response.close()

        for book_url in _book_url_re.findall(html):
            # TODO: make this smarter, crawl if running at a different time
            # Skip books already stored as a subject. A triple-pattern
            # membership test avoids iterating over every subject in the
            # graph for each candidate URL.
            if (URIRef(book_url), None, None) in graph:
                continue
            print("fetching url=%s [current graph size=%s]" % (book_url, len(graph)))
            # some urls in the catalog 404 believe it or not
            try:
                graph.parse(location=book_url, format='rdfa', lax=True)
            except Exception as e:
                # Best-effort crawl: report the failure and move on to the
                # next book instead of aborting the whole run.
                print(e)

    # Dump everything collected so far; the file is closed deterministically.
    with open('catalog.rdf', 'w') as rdf_file:
        graph.serialize(rdf_file)
finally:
    # Always close the store, even when the crawl fails part-way.
    graph.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment