Skip to content

Instantly share code, notes, and snippets.

@sudkumar
Created October 5, 2016 14:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sudkumar/129e4528031c43fd875065781b3848d0 to your computer and use it in GitHub Desktop.
Save sudkumar/129e4528031c43fd875065781b3848d0 to your computer and use it in GitHub Desktop.
# import the required stuff
import urllib2
import lxml.html
import urlparse
# URLs that turned out to be PDFs. downloadIfPDF appends to this list, and the
# script checks it at the very end to report when no PDF was found at all.
pdfLinks = []
def getContentType(connection):
    """Return the Content-Type header of an opened connection.

    `connection` must provide `info()`, whose result exposes
    `getheader(name)`. The header value is passed through unchanged, i.e.
    exactly what `getheader("Content-Type")` yields.
    """
    return connection.info().getheader("Content-Type")
def isPDF(connection):
    """Tell whether the response behind `connection` is a PDF document.

    The decision is based on the Content-Type header only. Servers may append
    parameters to the media type ("application/pdf; charset=binary"), vary the
    letter case or add whitespace, so the header is normalised before it is
    compared. A missing or empty header is treated as "not a PDF".

    Returns True for a PDF response, False otherwise.
    """
    contentType = getContentType(connection)
    if not contentType:
        return False
    # Keep only the media type: drop any ";param=value" suffix, then
    # normalise whitespace and case.
    mediaType = contentType.split(";", 1)[0].strip().lower()
    return mediaType == "application/pdf"
def downloadFile(fileName, connection):
    """Stream the body of `connection` into a local file.

    Parameters:
        fileName: path of the file to create; it is opened in "wb" mode, so
            an existing file with that name is overwritten.
        connection: an open response offering `info()` and `read(size)`,
            such as the object returned by urllib2.urlopen.

    Returns nothing. Progress is written to stdout on a single line that is
    refreshed after every block. Errors raised while reading or writing
    propagate to the caller, but the output file is always closed first.
    """
    import sys  # local import: only needed for the in-place progress line

    # Content-Length is optional (and may be malformed), so its absence must
    # not abort the download; the size is then simply reported as unknown.
    lengthHeaders = connection.info().getheaders("Content-Length")
    try:
        fileSize = int(lengthHeaders[0])
    except (IndexError, ValueError, TypeError):
        fileSize = None

    shownSize = fileSize if fileSize is not None else "unknown"
    print("Downloading... '%s' of Bytes : '%s'" % (fileName, shownSize))

    # Copy the body in fixed-size blocks so large files never sit in memory.
    blockSize = 8192
    downloadedFileSize = 0
    with open(fileName, "wb") as output:
        while True:
            chunk = connection.read(blockSize)
            if not chunk:
                # An empty read means the body is exhausted.
                break
            downloadedFileSize += len(chunk)
            output.write(chunk)

            # A percentage is only meaningful with a known, non-zero total.
            if fileSize:
                status = "%10d [%3.2f%%]" % (
                    downloadedFileSize,
                    downloadedFileSize * 100.0 / fileSize,
                )
            else:
                status = "%10d" % downloadedFileSize
            # "\r" moves back to column 0, so each update overwrites the
            # previous one instead of scrolling the terminal.
            sys.stdout.write("\r" + status)
            sys.stdout.flush()

    # Finish the progress line so later output starts on a fresh line.
    if downloadedFileSize:
        sys.stdout.write("\n")
def downloadIfPDF(url):
    """Open `url` and save it to the working directory when it is a PDF.

    Parameters:
        url: the address to visit.

    Returns one of three things:
        True   - the URL served a PDF and it was downloaded.
        False  - the URL could not be opened (the error is printed, not
                 raised, so the caller can carry on with other links).
        object - the still-open connection of a non-PDF response, so the
                 caller can read the page; the caller should close it.
    """
    print("visiting: " + url)
    try:
        connection = urllib2.urlopen(url)
    except Exception as e:
        # Best effort: show the problem to the user and let the caller
        # continue instead of aborting the whole run.
        print(e)
        return False

    if not isPDF(connection):
        # Not a PDF, but the opened page is still useful to the caller.
        return connection

    pdfLinks.append(url)
    print("Got pdf file at " + url)
    print("Getting filename...")
    # Name the file after the last path segment only, so query strings and
    # fragments do not leak into the filename. A path ending in "/" has no
    # last segment; fall back to a fixed name rather than failing on open("").
    fileName = urlparse.urlsplit(url).path.split("/")[-1] or "download.pdf"
    try:
        downloadFile(fileName, connection)
    finally:
        # The response is fully consumed (or failed); release the socket.
        connection.close()
    return True
# Ask the user for the starting URL.
connectionUrl = raw_input("Please enter the url and hit enter:\n")
downloadable = downloadIfPDF(connectionUrl)

# downloadIfPDF returns True (PDF saved), False (could not connect) or the
# open connection of a non-PDF page. Identity checks keep a connection object
# from ever being mistaken for one of the two flags.
if downloadable is True:
    print("Download complete")
    raise SystemExit
if downloadable is False:
    # Something went wrong with the URL connection.
    print("exiting now")
    raise SystemExit

# The URL is an ordinary page: build the DOM from its body, then release the
# connection whether or not parsing succeeded.
try:
    dom = lxml.html.fromstring(downloadable.read())
finally:
    downloadable.close()

# Visit the href of every <a> tag and download the ones that are PDFs.
for link in dom.xpath('//a/@href'):
    # Many pages use relative hrefs. urljoin resolves them against the page
    # URL (page-relative and root-relative alike) and returns absolute hrefs
    # unchanged, so a single attempt per link is enough.
    target = urlparse.urljoin(connectionUrl, link)
    downloadable = downloadIfPDF(target)
    if downloadable is False:
        print("Noop. That's a invalid url!! Moving on...")
    elif downloadable is not True:
        # A non-PDF page was opened; it is not needed, so close it.
        downloadable.close()

if not pdfLinks:
    print("No PDF links found at " + connectionUrl)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment