Skip to content

Instantly share code, notes, and snippets.

@soeirosantos
Created October 9, 2012 17:10
Show Gist options
  • Save soeirosantos/3860096 to your computer and use it in GitHub Desktop.
Save soeirosantos/3860096 to your computer and use it in GitHub Desktop.
Simple script for downloading PDFs from a specific page
"""
Simple script for downloading PDFs from a specific page.
Depends on BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
import urllib2
import urllib
from bs4 import BeautifulSoup
# Base URL under which the PDFs (and the page that links to them) live.
path = 'http://some/interesting/place/with/pdfs'
# Specific page below that URL to scan for links, if there is one.
page = 'somepage.html'
page_path = path + "/" + page

# Fetch the page once and always release the connection, even if reading
# fails part-way through.
response = urllib2.urlopen(page_path)
try:
    html = response.read()
finally:
    response.close()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed; then collect every <a> element on the page.
all_links = BeautifulSoup(html, "html.parser").find_all("a")
def getFileName(href):
    """Return the part of *href* that follows its last "/".

    When *href* contains no "/", the whole string is returned; when it ends
    with "/", the result is the empty string.
    """
    return href.rsplit("/", 1)[-1]
def completeUrl(href, path):
    """Return *href* as an absolute URL, prefixing *path* when it is relative.

    An href that already starts with "http://" or "https://" (compared
    case-insensitively) is returned unchanged. Any other href is returned as
    ``path + "/" + href``.
    """
    # Test the prefix instead of searching for a substring: this recognises
    # https links as absolute, and still resolves a relative href that merely
    # contains "http://" somewhere inside it (e.g. in a query string).
    if href.lower().startswith(("http://", "https://")):
        return href
    return path + "/" + href
# Walk every anchor found on the page and act on those that point at a PDF.
for link in all_links:
    href = link.get("href")
    # Anchors without an href attribute (e.g. <a name="...">) give None here;
    # skip them instead of failing on the substring test below.
    if href and ".pdf" in href:
        # NOTE(review): this prints the download call as a literal string; it
        # does not download anything. Remove the quotes (and the print) to
        # actually fetch each file -- confirm that is the intended behavior.
        print("urllib.urlretrieve(completeUrl(href, path), getFileName(href))")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment