Skip to content

Instantly share code, notes, and snippets.

@nfaggian
Created July 6, 2013 09:32
Show Gist options
  • Save nfaggian/5939378 to your computer and use it in GitHub Desktop.
Pydap XML crawler
from xml.etree import cElementTree
from urlparse import urljoin
import requests
# XML namespace URIs used to qualify element and attribute names when
# parsing THREDDS catalog documents with ElementTree.
# THREDDS InvCatalog v1.0 schema namespace (catalog, catalogRef, dataset elements).
thredds = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
# W3C XLink namespace (the href attribute on catalogRef elements).
xlink = "http://www.w3.org/1999/xlink"
def crawl(catalog):
    """Recursively crawl a THREDDS catalog, yielding dataset elements.

    Performs a depth-first traversal: every ``<catalogRef>`` found in the
    catalog is fetched and crawled before this catalog's own datasets are
    yielded.

    Parameters
    ----------
    catalog : str
        Absolute URL of a THREDDS XML catalog document.

    Yields
    ------
    Element
        Each ``<dataset>`` element carrying a ``urlPath`` attribute,
        i.e. an actual data endpoint rather than a container node.

    Raises
    ------
    requests.HTTPError
        If any catalog URL returns an HTTP error status.
    """
    r = requests.get(catalog)
    # Fail loudly on HTTP errors rather than handing an HTML error page
    # to the XML parser, which would raise a cryptic ParseError instead.
    r.raise_for_status()
    xml = cElementTree.fromstring(r.content)
    # Depth-first: descend into every referenced sub-catalog first.
    for subdir in xml.iterfind(".//{%s}catalogRef" % thredds):
        # catalogRef links live in the xlink namespace and may be
        # relative, so resolve them against the current catalog URL.
        link = subdir.attrib["{%s}href" % xlink]
        for dataset in crawl(urljoin(catalog, link)):
            yield dataset
    # Only dataset elements with a urlPath attribute point at real data.
    for dataset in xml.iterfind(".//{%s}dataset[@urlPath]" % thredds):
        yield dataset
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment