edsu/oreilly.py

## oreilly.py
#!/usr/bin/env python

"""
Crawl O'Reilly's Book Catalog, extract RDFa about the books, and stash away
triples in a rdflib BerkeleyDB store.

You will need the trunk version of rdflib installed, or otherwise available.
You also will need html5lib for the lax, tagsoup parsing--O'Reilly's html
for its book pages isn't well-formed at the moment.
"""

import re
import urllib

from rdflib.graph import ConjunctiveGraph
from rdflib.term import URIRef

# Catalog pages that are scanned for links to individual book pages.
catalog_urls = [
    "http://oreilly.com/store/complete.html",
    "http://oreilly.com/store/complete2.html",
    "http://oreilly.com/store/complete3.html",
    "http://oreilly.com/store/complete4.html",
]

# Quoted book-page links as they appear in the catalog HTML.  Compiled once
# because it is reused for every catalog page; the dots are escaped so they
# only match a literal ".".
book_url_pattern = re.compile(r'"(http://oreilly\.com/catalog/\d+/)"')

# Persistent triple store kept in the local "store" location.
graph = ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)

try:
    for catalog_url in catalog_urls:
        response = urllib.urlopen(catalog_url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        for book_url in book_url_pattern.findall(html):
            # TODO: make this smarter, crawl if running at a different time
            # Skip books that already have triples in the store.  A triple
            # pattern lookup lets the store answer this directly instead of
            # iterating over every subject for each candidate URL.
            if (URIRef(book_url), None, None) in graph:
                continue
            # Single-argument print(...) prints the same text as the Python 2
            # print statement while also being valid Python 3 syntax.
            print("fetching url=%s [current graph size=%s]" % (book_url, len(graph)))
            # some urls in the catalog 404 believe it or not
            try:
                # lax=True requests the forgiving (html5lib) parse, since the
                # book pages are not well-formed.
                graph.parse(location=book_url, format='rdfa', lax=True)
            except Exception as e:
                # Best effort: report the failure and move on to the next book.
                print(e)

    # Dump everything collected so far; the with-block guarantees the output
    # file is flushed and closed.
    with open('catalog.rdf', 'w') as rdf_file:
        graph.serialize(rdf_file)
finally:
    # Always close the store so it is left in a clean state, even when the
    # crawl is interrupted or fails part-way through.
    graph.close()
	#!/usr/bin/env python

	"""
	Crawl O'Reilly's Book Catalog, extract RDFa about the books, and stash away
	triples in a rdflib BerkeleyDB store.

	You will need the trunk version of rdflib installed, or otherwise available.
	You also will need html5lib for the lax, tagsoup parsing--O'Reilly's html
	for its book pages isn't well-formed at the moment.
	"""

	import re
	import urllib

	from rdflib.graph import ConjunctiveGraph
	from rdflib.term import URIRef

	# Catalog pages that are scanned for links to individual book pages.
	catalog_urls = [
	    "http://oreilly.com/store/complete.html",
	    "http://oreilly.com/store/complete2.html",
	    "http://oreilly.com/store/complete3.html",
	    "http://oreilly.com/store/complete4.html",
	]

	# Quoted book-page links as they appear in the catalog HTML.  Compiled once
	# because it is reused for every catalog page; the dots are escaped so they
	# only match a literal ".".
	book_url_pattern = re.compile(r'"(http://oreilly\.com/catalog/\d+/)"')

	# Persistent triple store kept in the local "store" location.
	graph = ConjunctiveGraph('Sleepycat')
	graph.open('store', create=True)

	try:
	    for catalog_url in catalog_urls:
	        response = urllib.urlopen(catalog_url)
	        try:
	            html = response.read()
	        finally:
	            # Release the connection even if the read fails.
	            response.close()

	        for book_url in book_url_pattern.findall(html):
	            # TODO: make this smarter, crawl if running at a different time
	            # Skip books that already have triples in the store.  A triple
	            # pattern lookup lets the store answer this directly instead of
	            # iterating over every subject for each candidate URL.
	            if (URIRef(book_url), None, None) in graph:
	                continue
	            # Single-argument print(...) prints the same text as the Python 2
	            # print statement while also being valid Python 3 syntax.
	            print("fetching url=%s [current graph size=%s]" % (book_url, len(graph)))
	            # some urls in the catalog 404 believe it or not
	            try:
	                # lax=True requests the forgiving (html5lib) parse, since the
	                # book pages are not well-formed.
	                graph.parse(location=book_url, format='rdfa', lax=True)
	            except Exception as e:
	                # Best effort: report the failure and move on to the next book.
	                print(e)

	    # Dump everything collected so far; the with-block guarantees the output
	    # file is flushed and closed.
	    with open('catalog.rdf', 'w') as rdf_file:
	        graph.serialize(rdf_file)
	finally:
	    # Always close the store so it is left in a clean state, even when the
	    # crawl is interrupted or fails part-way through.
	    graph.close()