Created
October 5, 2016 14:52
-
-
Save sudkumar/129e4528031c43fd875065781b3848d0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import the required stuff
import sys
import urllib2
import urlparse

import lxml.html
# URLs recognised as PDF documents during this run; downloadIfPDF appends to
# it and the script checks it at the end to report when nothing was found.
pdfLinks = []
def getContentType(connection):
    """Return the raw ``Content-Type`` header value of ``connection``.

    ``connection`` must provide ``info()`` whose result offers
    ``getheader(name)``; whatever that lookup yields is handed back
    unchanged.
    """
    return connection.info().getheader("Content-Type")
def isPDF(connection):
    """Tell whether the response behind ``connection`` is a PDF document.

    The decision is based on the ``Content-Type`` header only. Media types
    are case-insensitive and may carry parameters (for example
    ``application/pdf; charset=binary``), so the value is normalised before
    it is compared.

    Returns:
        ``True`` when the media type is ``application/pdf``, ``False``
        otherwise, including when the header is missing.
    """
    contentType = getContentType(connection)
    if not contentType:
        # No header at all: we cannot claim it is a PDF.
        return False
    # Drop any ";param=value" suffix and ignore case/whitespace.
    mediaType = contentType.split(";", 1)[0].strip().lower()
    return mediaType == "application/pdf"
def downloadFile(fileName, connection):
    """Stream the body of ``connection`` into the local file ``fileName``.

    Args:
        fileName: Path of the file to create; it is opened in ``"wb"`` mode,
            so an existing file of that name is overwritten.
        connection: Open response object providing ``info()`` (header
            mapping with ``get``) and ``read(size)``.

    The body is copied in 8 KiB blocks while a single-line progress
    indicator is rewritten on stdout. When the server sends no usable
    ``Content-Length`` only the byte count is shown, without a percentage.
    """
    # The size is optional (e.g. chunked responses), so never index into it.
    declared = connection.info().get("Content-Length")
    try:
        fileSize = int(declared)
    except (TypeError, ValueError):
        fileSize = None
    knownSize = fileSize is not None and fileSize > 0

    shownSize = fileSize if fileSize is not None else "unknown"
    print("Downloading... '%s' of Bytes : '%s'" % (fileName, shownSize))

    blockSize = 8192
    downloadedFileSize = 0
    # "with" guarantees the file is closed even if a read/write fails.
    with open(fileName, "wb") as target:
        while True:
            chunk = connection.read(blockSize)
            if not chunk:
                # oh!! we are done
                break
            downloadedFileSize += len(chunk)
            target.write(chunk)

            if knownSize:
                status = "%10d [%3.2f%%]" % (
                    downloadedFileSize,
                    downloadedFileSize * 100.0 / fileSize,
                )
            else:
                status = "%10d" % downloadedFileSize
            # Carriage return rewrites the same line; a plain print would
            # add a newline and defeat the in-place progress display.
            sys.stdout.write("\r" + status)
            sys.stdout.flush()

    if downloadedFileSize:
        # Finish the progress line so later output starts on a fresh line.
        sys.stdout.write("\n")
def downloadIfPDF(url):
    """Open ``url`` and save it locally when the server reports a PDF.

    Args:
        url: Address to visit.

    Returns:
        ``False`` when the URL could not be opened (the error is printed,
        not raised); ``True`` when it was a PDF and has been downloaded
        into the current directory; otherwise the still-open connection so
        the caller can read the non-PDF body. In that last case the caller
        is responsible for closing it.
    """
    print("visiting: " + url)
    try:
        connection = urllib2.urlopen(url)
    except Exception as e:
        # Deliberately best-effort: show the problem to the user and let
        # the caller move on to the next link instead of aborting.
        print(e)
        return False

    if not isPDF(connection):
        # it wasn't a pdf file, but the caller may still want the content
        return connection

    pdfLinks.append(url)
    print("Got pdf file at " + url)
    print("Getting filename...")
    # Use only the path component so a query string or fragment does not
    # end up in the file name, and fall back to a fixed name when the path
    # ends with "/" (which would otherwise give an empty name).
    fileName = urlparse.urlsplit(url).path.rsplit("/", 1)[-1] or "download.pdf"
    try:
        downloadFile(fileName, connection)
    finally:
        # The body has been consumed (or the download failed); either way
        # the socket is no longer needed.
        connection.close()
    return True
# Ask the user for the starting address: either a direct PDF link or a page
# whose <a href> targets should be checked for PDFs.
connectionUrl = raw_input("Please enter the url and hit enter:\n")
downloadable = downloadIfPDF(connectionUrl)

# downloadIfPDF is tri-state (True / False / open connection), so compare by
# identity rather than truthiness.
if downloadable is True:
    # the url itself was a pdf and has been saved
    print("Download complete")
    raise SystemExit

# something went wrong with the url connection
if downloadable is False:
    print("exiting now")
    raise SystemExit

# Otherwise we hold an open connection to a non-PDF document: parse it and
# release the connection as soon as the body has been read.
try:
    # Resolve links against the address actually served (after redirects).
    baseUrl = downloadable.geturl()
    dom = lxml.html.fromstring(downloadable.read())
finally:
    downloadable.close()

# select the url in href for all <a> tags
for link in dom.xpath('//a/@href'):
    # urljoin keeps absolute links as they are and resolves root-relative
    # and path-relative ones against the page, so one attempt is enough.
    target = urlparse.urljoin(baseUrl, link)
    result = downloadIfPDF(target)
    if result is False:
        print("Noop. That's a invalid url!! Moving on...")
    elif result is not True:
        # Reachable but not a PDF: we only needed the headers, so close it.
        result.close()

if not pdfLinks:
    print("No PDF links found at " + connectionUrl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment