@jharjono
Created February 26, 2011 21:09
Downloads all files of a certain file format on a web page to the local filesystem.
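A typical invocation, assuming the script below is saved as download_docs.py (a hypothetical filename), looks like:

python download_docs.py -u http://example.com/papers/ -t pdf -o ./papers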
#!/usr/bin/env python
# Downloads all files of a certain file format on a web page to the local filesystem
import os
import urllib2
from lxml.html import fromstring
from urlparse import urljoin
from urllib import urlretrieve


class Downloader(object):
    def __init__(self, url, output_dir, doctype):
        self.url = url
        self.page = urllib2.urlopen(self.url).read()
        self.search_tree = fromstring(self.page)
        self.output_dir = output_dir
        self.doctype = doctype

    def start(self):
        links = self._get_doc_links(self.doctype)
        for f in links:
            self.download(f)

    def _get_doc_links(self, doctype):
        """
        Helper method to obtain all links of a certain doctype in ``self.url``
        """
        doc_links = []
        # Ensure the document type is a file extension
        if not doctype.startswith("."):
            doctype = "." + doctype
        for a in self.search_tree.cssselect('a'):
            href = a.get('href')
            if href and href.endswith(doctype):
                # Keep the full href so relative paths survive the later urljoin
                doc_links.append(href)
        return doc_links

    def download(self, link):
        """
        Does the actual downloading to the filesystem
        """
        file_url = urljoin(self.url, link)
        # Use only the last path segment as the local file name
        file_name = os.path.join(self.output_dir, link.split('/')[-1])
        print "Downloading file %s to %s" % (file_url, file_name)
        urlretrieve(file_url, file_name)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Downloads all documents of a specific file format on a webpage")
    parser.add_argument("-u", "--url", metavar="url",
                        help="the URL of the webpage", required=True, dest="url")
    parser.add_argument("-t", "--type", metavar="type",
                        help="document type (file extension)", required=True, dest="type")
    parser.add_argument("-o", "--output-dir", metavar="output_dir",
                        help="output directory for all the scraped files, defaults to cwd",
                        required=False, dest="output_dir")
    args = parser.parse_args()
    url = args.url
    output_dir = args.output_dir if args.output_dir else os.getcwd()
    doctype = args.type
    downloader = Downloader(url, output_dir, doctype)
    downloader.start()
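The script above targets Python 2 (urllib2, urlparse, and the print statement). As a minimal sketch of the same approach on Python 3, where those modules were reorganized into urllib.request and urllib.parse, something like the following should work; the function name download_all is hypothetical, and recent lxml releases require the separate cssselect package for cssselect():

#!/usr/bin/env python3
# Minimal Python 3 sketch of the same approach; download_all is a
# hypothetical name, and lxml's cssselect() needs the cssselect package.
import os
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from lxml.html import fromstring


def download_all(url, output_dir, doctype):
    # Ensure the document type is a file extension
    if not doctype.startswith("."):
        doctype = "." + doctype
    tree = fromstring(urlopen(url).read())
    for a in tree.cssselect("a"):
        href = a.get("href")
        if href and href.endswith(doctype):
            file_url = urljoin(url, href)
            # Use only the last path segment as the local file name
            file_name = os.path.join(output_dir, href.split("/")[-1])
            print("Downloading file %s to %s" % (file_url, file_name))
            urlretrieve(file_url, file_name)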