# sudkumar/PdfFromAUrlDownloader.py

## PdfFromAUrlDownloader.py
# import the required stuff
import sys
import urllib2
import urlparse

import lxml.html

pdfLinks = []

def getContentType(connection):
    """Return the value of the Content-Type header reported by ``connection``.

    ``connection`` must provide ``info()``, whose result is queried with
    ``getheader("Content-Type")``; whatever that call yields is returned as is.
    """
    return connection.info().getheader("Content-Type")


def isPDF(connection):
    """Tell whether the response behind ``connection`` is declared as a PDF.

    Only the media type of the Content-Type header is compared, so values
    carrying parameters ("application/pdf; charset=binary") or written in a
    different case ("Application/PDF") are recognised as well.

    Returns True for a PDF response and False otherwise, including when no
    Content-Type is reported at all.
    """
    contentType = getContentType(connection)
    if not contentType:
        return False

    # drop any ";parameter=value" suffix and normalise case before comparing
    mediaType = contentType.split(";", 1)[0].strip().lower()
    return mediaType == "application/pdf"


def downloadFile(fileName, connection):
    """Stream the body of ``connection`` into the local file ``fileName``.

    The body is read in 8192-byte blocks and a progress line is rewritten
    in place on stdout after every block.

    fileName   -- path of the file to create (opened in "wb" mode).
    connection -- open response providing ``info()`` and ``read(size)``.

    The Content-Length header is used only for the progress percentage; when
    it is missing, not a number, or zero, just the byte count is shown.
    Errors raised while opening or writing the file, or while reading from
    the connection, propagate to the caller; the file is closed either way.
    """
    # the size is informational, so a missing or malformed header must not
    # abort the download
    contentLengths = connection.info().getheaders("Content-Length")
    try:
        fileSize = int(contentLengths[0])
    except (IndexError, ValueError):
        fileSize = None

    sizeLabel = fileSize if fileSize is not None else "unknown"
    print("Downloading... '%s' of Bytes : '%s'" % (fileName, sizeLabel))

    # set a block size and indicator for downloaded file size
    blockSize = 8192
    downloadedFileSize = 0

    # the with-block closes the file even if a read or write fails midway
    with open(fileName, "wb") as outFile:
        while True:
            chunk = connection.read(blockSize)
            if not chunk:
                # an empty read means the body is exhausted
                break

            downloadedFileSize += len(chunk)
            outFile.write(chunk)

            # show the download status to the user
            if fileSize:
                status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
            else:
                status = "%10d" % downloadedFileSize

            # "\r" returns to the start of the line so the next status
            # overwrites this one instead of scrolling the terminal
            sys.stdout.write("\r" + status)
            sys.stdout.flush()

    if downloadedFileSize:
        # finish the progress line so later output starts on a fresh line
        sys.stdout.write("\n")

def downloadIfPDF(url):
    """Open ``url`` and save it to the current directory when it is a PDF.

    Returns one of three things:
      False           -- the url could not be opened; the error is printed.
      True            -- the url was a PDF; it was recorded in ``pdfLinks``
                         and written to disk by ``downloadFile``.
      the connection  -- the url opened fine but is not a PDF; the still-open
                         connection is handed back so the caller can read it
                         (and should close it when done).

    Errors raised while saving a PDF are not caught here.
    """
    print("visiting: " + url)

    # try to connect to the provided url
    try:
        connection = urllib2.urlopen(url)
    except Exception as e:
        # let's not raise the exception
        # instead, show it to the user, and let's continue
        print(e)
        return False

    if not isPDF(connection):
        # it wasn't a pdf file, but the caller may still want the content
        return connection

    pdfLinks.append(url)
    print("Got pdf file at " + url)

    print("Getting filename...")
    # use the last segment of the url *path* as filename, so a query string
    # or fragment never ends up in the name; fall back to a fixed name when
    # the path ends with "/" and the segment is therefore empty
    fileName = urlparse.urlsplit(url).path.split("/")[-1] or "download.pdf"

    try:
        downloadFile(fileName, connection)
    finally:
        # the response has been consumed (or failed); release the socket
        connection.close()

    # return true that we have downloaded the file
    return True


# get the url from the user as input
connectionUrl = raw_input("Please enter the url and hit enter:\n")

downloadable = downloadIfPDF(connectionUrl)

# the url itself was a pdf and has already been saved
if downloadable is True:
    print("Download complete")
    sys.exit()

# something went wrong with the url connection
if downloadable is False:
    print("exiting now")
    sys.exit()


# otherwise downloadable is the open connection to a non-pdf page:
# remember where the page really lives (redirects may have moved it),
# build the dom from its content, then release the connection
pageUrl = downloadable.geturl()
dom = lxml.html.fromstring(downloadable.read())
downloadable.close()

# select the url in href for all <a> tags
for link in dom.xpath('//a/@href'):

    # many websites use relative urls: resolve every href against the page
    # url first (absolute links come back from urljoin unchanged)
    absoluteLink = urlparse.urljoin(pageUrl, link)

    # download if it is of type pdf
    downloadable = downloadIfPDF(absoluteLink)

    if downloadable is False:
        print("Noop. That's a invalid url!! Moving on...")
    elif downloadable is not True:
        # not a pdf: we were handed the open connection and do not need it
        downloadable.close()

# tell the user when the crawl did not come across a single pdf
if not pdfLinks:
    print("No PDF links found at " + connectionUrl)