
@royshil
Last active March 22, 2020 01:27
Download All PDFs in a URL using Python mechanize
# Loosely based on: http://stackoverflow.com/questions/5974595/download-all-the-linksrelated-documents-on-a-webpage-using-python
import cookielib
import urllib2
import mechanize
from time import sleep
import os
import cgi
# A routine to download a file from a link, by simulating a click on it
def downloadlink(linkUrl, referer):
    r = br.click_link(linkUrl)
    r.add_header("Referer", referer)  # add a Referer header, just in case
    response = br.open(r)
    # get the filename from the response headers if possible
    cdheader = response.info().getheader('Content-Disposition')
    if cdheader:
        value, params = cgi.parse_header(cdheader)
        filename = params["filename"]
    else:
        # otherwise fall back to the link's basename
        filename = os.path.basename(linkUrl.url)
    f = open(filename, "wb")  # binary mode so PDFs aren't mangled. TODO: perhaps ensure that the file doesn't already exist?
    f.write(response.read())  # write the response content to disk
    f.close()
    print filename, "has been downloaded"
    br.back()
# Make a Browser (think of this as chrome or firefox etc)
br = mechanize.Browser()
# Enable cookie support for urllib2
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar( cookiejar )
# Browser options
br.set_handle_equiv( True )
br.set_handle_gzip( True )
br.set_handle_redirect( True )
br.set_handle_referer( True )
br.set_handle_robots( False )
br.set_handle_refresh( mechanize._http.HTTPRefreshProcessor(), max_time = 1 )
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] # masquerade as a real browser. this is not nice to do though.
# Open your site
mypageUrl = 'http://my.url.com/page'
br.open(mypageUrl)
print "Get all PDF links\n"
filetypes=["pdf", "PDF"] # pattern matching for links, can add more kinds here
myfiles=[]
for l in br.links():
    # check if this link has the file extension or text we want
    myfiles.extend([l for t in filetypes if t in l.url or t in l.text])
for l in myfiles:
# for index, l in zip(range(100), myfiles):  # <--- uncomment this line (and comment out the one above) to download only the first 100 links.
    # sleep(1)  # uncomment to throttle downloads, so you don't hammer the site
    downloadlink(l, mypageUrl)
@BruceDai003

Does this work if I want to download the specified extension files? e.g., 'problem1.py', 'answer1.py', 'tutorial1.pptx', 'syllabus.pdf', etc?
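The filter in the script is just a substring check against each link's URL and text, so in principle any extension added to the filetypes list (e.g. "py", "pptx") would be picked up. A bare substring like "py" will also match links that merely contain those letters, though, so a slightly safer variation (an untested sketch, reusing the gist's br browser object) is to compare the URL's actual extension:

import os
filetypes = ["pdf", "py", "pptx"]  # extensions of interest, lowercase
myfiles = []
for l in br.links():
    ext = os.path.splitext(l.url)[1].lstrip(".").lower()  # e.g. "pdf"
    if ext in filetypes:
        myfiles.append(l)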
