Skip to content

Instantly share code, notes, and snippets.

@nfaggian
Created July 6, 2013 09:32
Show Gist options
  • Save nfaggian/5939378 to your computer and use it in GitHub Desktop.
Pydap XML crawler
from xml.etree import cElementTree
from urlparse import urljoin
import requests
# XML namespace URIs used to qualify element and attribute names when
# parsing THREDDS catalog documents with ElementTree.
# THREDDS InvCatalog v1.0 schema namespace (catalog, catalogRef, dataset elements).
thredds = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
# W3C XLink namespace (the href attribute on catalogRef elements).
xlink = "http://www.w3.org/1999/xlink"
def crawl(catalog):
    """Recursively crawl a THREDDS catalog, yielding dataset elements.

    Performs a depth-first traversal: every ``<catalogRef>`` found in the
    catalog is fetched and crawled before this catalog's own datasets are
    yielded.

    Parameters
    ----------
    catalog : str
        Absolute URL of a THREDDS XML catalog document.

    Yields
    ------
    Element
        Each ``<dataset>`` element carrying a ``urlPath`` attribute,
        i.e. an actual data endpoint rather than a container node.

    Raises
    ------
    requests.HTTPError
        If any catalog URL returns an HTTP error status.
    """
    r = requests.get(catalog)
    # Fail loudly on HTTP errors rather than handing an HTML error page
    # to the XML parser, which would raise a cryptic ParseError instead.
    r.raise_for_status()
    xml = cElementTree.fromstring(r.content)
    # Depth-first: descend into every referenced sub-catalog first.
    for subdir in xml.iterfind(".//{%s}catalogRef" % thredds):
        # catalogRef links live in the xlink namespace and may be
        # relative, so resolve them against the current catalog URL.
        link = subdir.attrib["{%s}href" % xlink]
        for dataset in crawl(urljoin(catalog, link)):
            yield dataset
    # Only dataset elements with a urlPath attribute point at real data.
    for dataset in xml.iterfind(".//{%s}dataset[@urlPath]" % thredds):
        yield dataset
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment