Downloads all PDFs from a given webpage
#!/usr/bin/env python3
__author__ = 'R3DDY97'
__license__ = 'MIT'
__version__ = '1.0.0'

import os
import logging
from sys import argv
from urllib.parse import urljoin

import requests
from lxml import html


def Usage():
    print(
        """
        Usage:-
        Downloads all the pdf links from a given webpage

            python pdf-dl.py url <path/to/directory>

        path is optional
        will save in the current directory if no path is given or the given path does not exist

        Requires:- requests and lxml
            pip install -U requests
            pip install -U lxml
        """)
def get_pdf_links(base_url):
    """Collect every .pdf href on the page and return the absolute URLs."""
    try:
        url_text = requests.get(base_url).text
    except requests.exceptions.ConnectionError:
        print("\nCheck the entered url...... \n\n")
        logging.warning('ConnectionError - check the entered url')
        return []
    tree = html.fromstring(url_text)
    atags = tree.cssselect("a")
    pdf_src = [a.attrib["href"] for a in atags if ".pdf" in a.attrib.get("href", "")]
    if not pdf_src:
        logging.warning('No pdf links found on the webpage.')
        print("\nNo pdf links found in {}\n".format(base_url))
        return []
    pdf_links = [urljoin(base_url, pdf_href) for pdf_href in pdf_src]
    print("\nFound {} pdf links... \n".format(len(pdf_links)))
    return pdf_links
def get_pdf(pdf_links, base_dir=os.getcwd()):
    """Download each PDF link into base_dir."""
    print("\nDownloading {} pdf's in \n{}\n".format(len(pdf_links), base_dir))
    for link in pdf_links:
        pdf_name = link.split("/")[-1]
        pdf_path = os.path.join(base_dir, pdf_name)
        print("Started downloading {}...\n".format(pdf_name))
        try:
            pdf_raw = requests.get(link).content
        except requests.exceptions.ConnectionError:
            print("\nCould not download {}\n".format(link))
            logging.warning('ConnectionError while downloading %s', link)
            continue
        with open(pdf_path, 'wb') as pdf:
            pdf.write(pdf_raw)
        print("Finished downloading {}\n".format(pdf_name))
    print("\nCompleted Downloads!!...\nCheck {} folder for pdfs".format(base_dir))
# def progress_bar(link):
#     link_metadata = requests.get(link, stream=True)
#     file_size = round(float(link_metadata.headers['Content-length']) / 1024, 2)
def main():
    os.system("clear||cls")
    if len(argv) not in (2, 3):
        print('Error! Invalid arguments')
        Usage()
        raise SystemExit

    # assign base_url and base_dir
    base_url = argv[1]
    if len(argv) == 3 and os.path.isdir(argv[2]):
        base_dir = os.path.join(os.path.abspath(argv[2]), "Downloaded_pdfs")
    else:
        base_dir = os.path.join(os.getcwd(), "Downloaded_pdfs")
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)

    # log the session
    logging.basicConfig(filename=os.path.join(base_dir, 'pdf_downloader.log'),
                        level=logging.INFO,
                        format='%(asctime)s %(message)s')

    pdf_links = get_pdf_links(base_url)
    if pdf_links:
        get_pdf(pdf_links, base_dir)
if __name__ == '__main__':
    main()
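Note: the script reads each PDF fully into memory before writing it to disk. For large files, a chunked download keeps memory use flat and also exposes the Content-length header that the commented-out progress_bar idea would need. Below is a minimal sketch of that approach; the download_chunked helper name and the 8 KB chunk size are illustrative assumptions, not part of the gist.

# Illustrative sketch: stream the response in chunks so large PDFs are never
# held fully in memory. Helper name and chunk size are assumptions.
import requests

def download_chunked(link, pdf_path, chunk_size=8192):
    with requests.get(link, stream=True) as response:
        response.raise_for_status()
        with open(pdf_path, 'wb') as pdf:
            for chunk in response.iter_content(chunk_size=chunk_size):
                pdf.write(chunk)

A call like download_chunked(link, pdf_path) could stand in for the pdf.write(pdf_raw) step inside get_pdf if memory usage ever becomes a concern.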