# fils/microToRDF.py

## microToRDF.py
from creepy import Crawler

# TODO: write the output to a file; filter out the CSS-related triples

class MyCrawler(Crawler):
    """Crawler that prints the Any23 extraction of each page it fetches.

    For every document fetched with HTTP status 200, the page URL is handed
    to the Any23 endpoint at data.oceandrilling.org and the response, which
    is requested in N-Quads format, is printed to standard output.
    """

    def process_document(self, doc):
        """Send ``doc.url`` to Any23 and print the service's response.

        Args:
            doc: The fetched document; only ``doc.status`` and ``doc.url``
                are read here.

        Documents whose status is not 200 are skipped.
        """
        if doc.status != 200:
            return

        # Nothing at file level imports urllib or urlopen, so the names are
        # resolved here; the fallback covers the Python 3 module layout.
        try:
            from urllib import quote_plus
            from urllib2 import urlopen
        except ImportError:
            from urllib.parse import quote_plus
            from urllib.request import urlopen

        # Original author's note: doc.text is also available to work with.
        packet = quote_plus(doc.url)
        # format= may be set to any valid Any23 format: json, rdfxml, ttl,
        # ntriples, nquads, trix.  The body text could instead be passed to
        # the other Any23 API.
        response = urlopen(
            "http://data.oceandrilling.org/any23/any23/?format=nquads&uri="
            + packet + "&validation-mode=none#")
        try:
            nquads = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        print(nquads)

# Script entry: crawl http://paleoseek.net with follow mode F_SAME_HOST.
crawler = MyCrawler()
crawler.set_follow_mode(Crawler.F_SAME_HOST)
# Raw string so the regex escape "\." is not parsed as a string escape.
# NOTE(review): presumably URLs matching this pattern (images, scripts,
# stylesheets, flash) are excluded from the crawl -- confirm against creepy.
crawler.add_url_filter(r'\.(jpg|jpeg|gif|png|js|css|swf)$')
crawler.crawl('http://paleoseek.net')
	# NOTE(review): everything below repeats the script that precedes it in this file; it looks like a pasted duplicate -- confirm whether it should be removed.
from creepy import Crawler

# TODO: write the output to a file; filter out the CSS-related triples


class MyCrawler(Crawler):
    """Crawler that prints the Any23 extraction of each page it fetches.

    For every document fetched with HTTP status 200, the page URL is handed
    to the Any23 endpoint at data.oceandrilling.org and the response, which
    is requested in N-Quads format, is printed to standard output.
    """

    def process_document(self, doc):
        """Send ``doc.url`` to Any23 and print the service's response.

        Args:
            doc: The fetched document; only ``doc.status`` and ``doc.url``
                are read here.

        Documents whose status is not 200 are skipped.
        """
        if doc.status != 200:
            return

        # Nothing at file level imports urllib or urlopen, so the names are
        # resolved here; the fallback covers the Python 3 module layout.
        try:
            from urllib import quote_plus
            from urllib2 import urlopen
        except ImportError:
            from urllib.parse import quote_plus
            from urllib.request import urlopen

        # Original author's note: doc.text is also available to work with.
        packet = quote_plus(doc.url)
        # format= may be set to any valid Any23 format: json, rdfxml, ttl,
        # ntriples, nquads, trix.  The body text could instead be passed to
        # the other Any23 API.
        response = urlopen(
            "http://data.oceandrilling.org/any23/any23/?format=nquads&uri="
            + packet + "&validation-mode=none#")
        try:
            nquads = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        print(nquads)


crawler = MyCrawler()
crawler.set_follow_mode(Crawler.F_SAME_HOST)
# The pasted copy had backslash-escaped pipes ("\|"), which in a regex match
# a literal "|" instead of separating alternatives; plain "|" restores the
# extension alternation used by the first copy of this script.
crawler.add_url_filter(r'\.(jpg|jpeg|gif|png|js|css|swf)$')
crawler.crawl('http://paleoseek.net')