Skip to content

Instantly share code, notes, and snippets.

@TRManderson
Created May 19, 2016 01:36
Show Gist options
  • Save TRManderson/68c29ba554c5e13cfd0b2bb28252f0b2 to your computer and use it in GitHub Desktop.
Save TRManderson/68c29ba554c5e13cfd0b2bb28252f0b2 to your computer and use it in GitHub Desktop.
Recursively download everything reachable from a start page, as long as each URL starts with a given root URL.
import mechanize, os, errno
from urlparse import urldefrag
from time import sleep
def _local_path(root, page):
    """Map *page* (a URL that starts with *root*) to a relative local file path."""
    # Strip leading slashes so a root without a trailing "/" can never
    # turn the remainder of the URL into an absolute filesystem path.
    fname = page[len(root):].lstrip("/")
    # A directory-style URL carries no file name; store it as an index page.
    if not fname or fname.endswith("/"):
        fname += "index.html"
    return fname


def recursive_download(root, page, browser=None, visited=None):
    """Download *page* and, recursively, every page it links to under *root*.

    Each page is saved relative to the current working directory, under
    the part of its URL that follows *root*. Links are followed only when
    their URL (fragment removed) starts with *root*, with a 0.5 second
    pause before each request.

    Args:
        root: URL prefix that every downloaded page must start with.
        page: URL of the page to download.
        browser: Browser object used for all requests (needs ``open``,
            ``response``, ``viewing_html`` and ``links``). A new
            ``mechanize.Browser`` is created when omitted.
        visited: Set of URLs already handled; updated in place. A fresh
            set is created when omitted.

    Returns:
        None.

    Raises:
        OSError: If the target directory cannot be created for a reason
            other than it already existing.
    """
    # Create the defaults per call: default values in the signature would be
    # built once and silently shared between unrelated crawls.
    if visited is None:
        visited = set()
    if page in visited or not page.startswith(root):
        return
    visited.add(page)
    if browser is None:
        browser = mechanize.Browser()

    fname = _local_path(root, page)
    print("Visiting {}".format(fname))
    try:
        browser.open(page)
    except Exception as exc:
        # Best-effort crawl: report a page that fails to load and move on,
        # but let KeyboardInterrupt / SystemExit propagate.
        print("Skipping {}: {}".format(fname, exc))
        return

    target_dir = os.path.dirname(fname) or "."
    if not os.path.exists(target_dir):
        try:
            os.makedirs(target_dir)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    # Binary mode: the response body is raw bytes and may not be text at all.
    with open(fname, "wb") as f:
        f.write(browser.response().read())

    if not browser.viewing_html():
        return
    # Snapshot the links first: the recursive calls below navigate this same
    # browser object away from the current page.
    links = list(browser.links())
    for link in links:
        url, _ = urldefrag(link.absolute_url)
        # Skip before sleeping so duplicate/out-of-scope links cost no delay.
        if url in visited or not url.startswith(root):
            continue
        sleep(0.5)
        recursive_download(root, url, browser=browser, visited=visited)
# Example: mirror the COMP4403 Blackboard website.
# `root` limits the crawl to this course's content; `page` is where it starts.
root = 'https://learn.uq.edu.au/bbcswebdav/pid-1704716-dt-content-rid-8397191_1/courses/COMP4403S_6620_20835/'
page = 'https://learn.uq.edu.au/bbcswebdav/pid-1704716-dt-content-rid-8397191_1/courses/COMP4403S_6620_20835/index.html'

if __name__ == "__main__":
    # `br` was previously used without ever being defined (NameError).
    # One browser instance is used for the whole crawl.
    br = mechanize.Browser()
    recursive_download(root, page, browser=br)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment