soeirosantos/downpdf.py

## downpdf.py
"""
Simple script for downloading PDFs from a specific page.
Depends on BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

import urllib2
import urllib
from bs4 import BeautifulSoup

# Base URL of the location that hosts the PDFs.
path = 'http://some/interesting/place/with/pdfs'

# Page under `path` whose links are scanned, if there is one.
page = 'somepage.html'

page_path = path + "/" + page

# Fetch the page and close the HTTP response even if reading fails.
response = urllib2.urlopen(page_path)
try:
    html = response.read()
finally:
    response.close()

# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
all_links = BeautifulSoup(html, "html.parser").find_all("a")

def getFileName(href):
    """Return the last path segment of ``href`` for use as a local file name.

    Any ``?query`` or ``#fragment`` suffix is removed first, so
    ``"docs/a.pdf?download=1"`` gives ``"a.pdf"``. An href without any ``/``
    is returned as-is (minus query/fragment); an href whose path ends in
    ``/`` gives an empty string.
    """
    # The fragment follows the query in a URL, so cut at "#" first, then "?".
    path_only = href.split("#", 1)[0].split("?", 1)[0]
    return path_only.split("/")[-1]

def completeUrl(href, path):
    """Return an absolute URL for ``href``.

    An ``href`` that starts with ``http://`` or ``https://`` (in any letter
    case) is returned unchanged. Anything else is treated as relative and
    appended to ``path`` with a single ``/`` separator.

    Note: root-relative hrefs (starting with ``/``) are not resolved against
    the host; they are appended to ``path`` like any other relative href.
    """
    # Check the scheme at the start only: a substring test would treat
    # "https://..." as relative and "get?u=http://..." as absolute.
    if href.lower().startswith(("http://", "https://")):
        return href
    return path + "/" + href

# Download every linked PDF, saving each one under the name returned by
# getFileName().
for link in all_links:
    href = link.get("href")
    # Anchors without an href attribute yield None; skip those and non-PDF links.
    if not href or ".pdf" not in href:
        continue
    url = completeUrl(href, path)
    # Single-argument print(...) behaves the same as the print statement here.
    print("downloading " + url)
    # Perform the download; the original only printed this call as literal text.
    urllib.urlretrieve(url, getFileName(href))
	"""
	Simple script for downloading PDFs from a specific page.
	Depends on BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
	"""

	import urllib2
	import urllib
	from bs4 import BeautifulSoup

	# Base URL of the location that hosts the PDFs.
	path = 'http://some/interesting/place/with/pdfs'

	# Page under `path` whose links are scanned, if there is one.
	page = 'somepage.html'

	page_path = path + "/" + page

	# Fetch the page and close the HTTP response even if reading fails.
	response = urllib2.urlopen(page_path)
	try:
		html = response.read()
	finally:
		response.close()

	# Name the parser explicitly so the parse result does not depend on which
	# optional parser libraries happen to be installed.
	all_links = BeautifulSoup(html, "html.parser").find_all("a")

	def getFileName(href):
		"""Return the last path segment of ``href`` for use as a local file name.

		Any ``?query`` or ``#fragment`` suffix is removed first, so
		``"docs/a.pdf?download=1"`` gives ``"a.pdf"``. An href without any ``/``
		is returned as-is (minus query/fragment); an href whose path ends in
		``/`` gives an empty string.
		"""
		# The fragment follows the query in a URL, so cut at "#" first, then "?".
		path_only = href.split("#", 1)[0].split("?", 1)[0]
		return path_only.split("/")[-1]

	def completeUrl(href, path):
		"""Return an absolute URL for ``href``.

		An ``href`` that starts with ``http://`` or ``https://`` (in any letter
		case) is returned unchanged. Anything else is treated as relative and
		appended to ``path`` with a single ``/`` separator.

		Note: root-relative hrefs (starting with ``/``) are not resolved against
		the host; they are appended to ``path`` like any other relative href.
		"""
		# Check the scheme at the start only: a substring test would treat
		# "https://..." as relative and "get?u=http://..." as absolute.
		if href.lower().startswith(("http://", "https://")):
			return href
		return path + "/" + href

	# Download every linked PDF, saving each one under the name returned by
	# getFileName().
	for link in all_links:
		href = link.get("href")
		# Anchors without an href attribute yield None; skip those and non-PDF links.
		if not href or ".pdf" not in href:
			continue
		url = completeUrl(href, path)
		# Single-argument print(...) behaves the same as the print statement here.
		print("downloading " + url)
		# Perform the download; the original only printed this call as literal text.
		urllib.urlretrieve(url, getFileName(href))