# jharjono/doc_downloader.py

## doc_downloader.py
#!/usr/bin/env python
# Downloads all files of a certain file format on a web page to local filesystem

import urllib2
import lxml
import os

from lxml.html import fromstring
from urlparse  import urljoin
from urllib    import urlretrieve

class Downloader(object):
    """
    Scrapes a web page for links to documents of one file type and saves
    each linked document into a local directory.

    The page at ``url`` is fetched and parsed when the object is created;
    ``start()`` then downloads every matching link into ``output_dir``.
    """

    def __init__(self, url, output_dir, doctype):
        """
        Fetch and parse the page to scrape.

        ``url``        -- address of the page whose links are examined
        ``output_dir`` -- directory the downloaded files are written to
        ``doctype``    -- file extension to look for, with or without the
                          leading dot (e.g. ``"pdf"`` or ``".pdf"``)

        Errors raised while opening or parsing the page propagate to the
        caller.
        """
        self.url = url
        # Close the HTTP response explicitly rather than leaving the
        # connection open until the object is garbage collected.
        response = urllib2.urlopen(self.url)
        try:
            self.page = response.read()
        finally:
            response.close()
        self.search_tree = fromstring(self.page)
        self.output_dir = output_dir
        self.doctype = doctype

    def start(self):
        """
        Download every link on the page that matches ``self.doctype``.
        """
        for link in self._get_doc_links(self.doctype):
            self.download(link)

    def _get_doc_links(self, doctype):
        """
        Helper method to obtain all links of a certain doctype in ``self.url``

        Returns a list with the ``href`` value of every ``<a>`` element
        whose target ends with the extension, in document order. The
        hrefs are returned unmodified (relative path or absolute URL) so
        that ``download`` can resolve them against the page URL.
        """
        # Ensure document type is a file extension
        if not doctype.startswith("."):
            doctype = "." + doctype
        # Compare case-insensitively so that "x.PDF" matches "pdf".
        doctype = doctype.lower()

        doc_links = []
        for a in self.search_tree.cssselect('a'):
            href = a.get('href')
            if href and href.lower().endswith(doctype):
                # Keep the whole href: reducing it to the file name here
                # would make download() look for the file next to the
                # page even when it lives in another directory or host.
                doc_links.append(href)
        return doc_links

    def download(self, link):
        """
        Does actual downloading to filesystem

        ``link`` may be a bare file name, a relative path or an absolute
        URL; it is resolved against ``self.url``. The file is stored in
        ``self.output_dir`` under the last path segment of ``link``.
        """
        file_url = urljoin(self.url, link)
        # Only the last part of the URL is used as the local file name.
        file_name = os.path.join(self.output_dir, link.split('/')[-1])
        # Single parenthesised argument: prints the same text whether
        # ``print`` is a statement or a function.
        print("Downloading file %s to %s" % (file_url, file_name))
        urlretrieve(file_url, file_name)


if __name__ == "__main__":
    # Command-line entry point: read the page URL, the document type and
    # an optional output directory, then download every matching file.
    import argparse

    cli = argparse.ArgumentParser(
        description="Downloads all documents of specific file format in a webpage")
    cli.add_argument(
        "-u", "--url",
        dest="url", metavar="url", required=True,
        help="the URL of the webpage")
    cli.add_argument(
        "-t", "--type",
        dest="type", metavar="type", required=True,
        help="document type")
    cli.add_argument(
        "-o", "--output-dir",
        dest="output_dir", metavar="output_dir", required=False,
        help="output directory for all the scraped files, defaults to cwd")

    options = cli.parse_args()

    # A missing or empty -o falls back to the current working directory.
    target_dir = options.output_dir or os.getcwd()

    Downloader(options.url, target_dir, options.type).start()
	#!/usr/bin/env python
	# Downloads all files of a certain file format on a web page to local filesystem

	import urllib2
	import lxml
	import os

	from lxml.html import fromstring
	from urlparse import urljoin
	from urllib import urlretrieve

	class Downloader(object):
	    """
	    Scrapes a web page for links to documents of one file type and saves
	    each linked document into a local directory.

	    The page at ``url`` is fetched and parsed when the object is created;
	    ``start()`` then downloads every matching link into ``output_dir``.
	    """

	    def __init__(self, url, output_dir, doctype):
	        """
	        Fetch and parse the page to scrape.

	        ``url``        -- address of the page whose links are examined
	        ``output_dir`` -- directory the downloaded files are written to
	        ``doctype``    -- file extension to look for, with or without the
	                          leading dot (e.g. ``"pdf"`` or ``".pdf"``)

	        Errors raised while opening or parsing the page propagate to the
	        caller.
	        """
	        self.url = url
	        # Close the HTTP response explicitly rather than leaving the
	        # connection open until the object is garbage collected.
	        response = urllib2.urlopen(self.url)
	        try:
	            self.page = response.read()
	        finally:
	            response.close()
	        self.search_tree = fromstring(self.page)
	        self.output_dir = output_dir
	        self.doctype = doctype

	    def start(self):
	        """
	        Download every link on the page that matches ``self.doctype``.
	        """
	        for link in self._get_doc_links(self.doctype):
	            self.download(link)

	    def _get_doc_links(self, doctype):
	        """
	        Helper method to obtain all links of a certain doctype in ``self.url``

	        Returns a list with the ``href`` value of every ``<a>`` element
	        whose target ends with the extension, in document order. The
	        hrefs are returned unmodified (relative path or absolute URL) so
	        that ``download`` can resolve them against the page URL.
	        """
	        # Ensure document type is a file extension
	        if not doctype.startswith("."):
	            doctype = "." + doctype
	        # Compare case-insensitively so that "x.PDF" matches "pdf".
	        doctype = doctype.lower()

	        doc_links = []
	        for a in self.search_tree.cssselect('a'):
	            href = a.get('href')
	            if href and href.lower().endswith(doctype):
	                # Keep the whole href: reducing it to the file name here
	                # would make download() look for the file next to the
	                # page even when it lives in another directory or host.
	                doc_links.append(href)
	        return doc_links

	    def download(self, link):
	        """
	        Does actual downloading to filesystem

	        ``link`` may be a bare file name, a relative path or an absolute
	        URL; it is resolved against ``self.url``. The file is stored in
	        ``self.output_dir`` under the last path segment of ``link``.
	        """
	        file_url = urljoin(self.url, link)
	        # Only the last part of the URL is used as the local file name.
	        file_name = os.path.join(self.output_dir, link.split('/')[-1])
	        # Single parenthesised argument: prints the same text whether
	        # ``print`` is a statement or a function.
	        print("Downloading file %s to %s" % (file_url, file_name))
	        urlretrieve(file_url, file_name)


	if __name__ == "__main__":
	    # Command-line entry point: read the page URL, the document type and
	    # an optional output directory, then download every matching file.
	    import argparse

	    parser = argparse.ArgumentParser(description="Downloads all documents of specific file format in a webpage")
	    parser.add_argument("-u", "--url", metavar="url",
	                        help="the URL of the webpage", required=True, dest="url")
	    parser.add_argument("-t", "--type", metavar="type",
	                        help="document type", required=True, dest="type")
	    parser.add_argument("-o", "--output-dir", metavar="output_dir",
	                        help="output directory for all the scraped files, defaults to cwd", required=False, dest="output_dir")

	    args = parser.parse_args()
	    url = args.url
	    # A missing or empty -o falls back to the current working directory.
	    output_dir = args.output_dir or os.getcwd()
	    doctype = args.type

	    downloader = Downloader(url, output_dir, doctype)
	    downloader.start()