Last active
December 4, 2017 09:16
-
-
Save R3DDY97/2bfdae8b512212a513c4c7b04af1571d to your computer and use it in GitHub Desktop.
Downloads all PDFs linked from a webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
def Usage():
    """Print the command-line help text for the PDF downloader."""
    # The empty first and last entries reproduce the blank lines that
    # surround the help text.
    help_lines = (
        "",
        "Usage:-",
        "Downloads all the pdf links from a given webpage",
        "python pdf-dl.py url <path/to/directory>",
        "path is optional",
        "will save in the current directory if no path is given or given path does not exist",
        "Requires:- requests and lxml",
        "pip install -U requests",
        "pip install -U lxml",
        "",
    )
    print("\n".join(help_lines))
# Module metadata.
__author__ = "R3DDY97"
__license__ = "MIT"
__version__ = "1.0.0"
import logging
import os
from sys import argv
from urllib.parse import unquote, urljoin, urlsplit

import requests
from lxml import html
def get_pdf_links(base_url):
    """Collect the absolute URL of every PDF link found on a webpage.

    The page at ``base_url`` is fetched and every ``<a href=...>`` whose
    target contains ``.pdf`` (case-insensitive) is resolved against
    ``base_url``.  The result is stored in the module-level ``pdf_links``
    list, which ``get_pdf`` reads, and is also returned.

    Args:
        base_url: Address of the webpage to scan.

    Returns:
        A list of absolute PDF URLs; empty when the page cannot be fetched
        or holds no PDF links.
    """
    global pdf_links
    # Bind the global up front so later readers never hit a NameError,
    # whichever path this function exits through.
    pdf_links = []

    try:
        # A timeout keeps an unresponsive server from hanging the script.
        page_source = requests.get(base_url, timeout=30).content
    except requests.exceptions.RequestException:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError is not part of it and would never match here.
        print("\nCheck the entered url...... \n\n")
        logging.warning(' ConnectionError \n Check the entered url ')
        return pdf_links

    if page_source.strip():
        # Bytes let lxml honour the document's own encoding declaration.
        # XPath needs only lxml, whereas ``cssselect`` requires an extra
        # package; selecting ``@href`` also skips anchors without a target.
        hrefs = html.fromstring(page_source).xpath("//a/@href")
    else:
        # lxml refuses to parse an empty document.
        hrefs = []

    pdf_src = [href for href in hrefs if ".pdf" in href.lower()]
    if not pdf_src:
        logging.warning('\nNo pdf links found on the webpage.\n')
        print("\nNo pdf links found in {}\n ".format(base_url))
        return pdf_links

    pdf_links = [urljoin(base_url, pdf_href) for pdf_href in pdf_src]
    print("\nFound {} pdf links... \n".format(len(pdf_links)))
    return pdf_links
def get_pdf(base_dir=None):
    """Download every URL in the module-level ``pdf_links`` list.

    Each file is streamed to ``<name>.part`` and renamed once complete, so
    an interrupted transfer never leaves a truncated ``.pdf`` behind.  A
    failing link is reported and skipped; the remaining links are still
    downloaded.

    Args:
        base_dir: Directory that receives the files.  Defaults to the
            working directory at call time.

    Returns:
        A list with the paths of the files that were saved successfully.
    """
    if base_dir is None:
        base_dir = os.getcwd()

    # ``pdf_links`` is published by ``get_pdf_links``; when it was never
    # bound there is simply nothing to download.
    try:
        links = list(pdf_links)
    except NameError:
        links = []

    saved_paths = []
    print("\nDownloading {} pdf's in \n{}\n".format(len(links), base_dir))
    for index, link in enumerate(links, start=1):
        # Take the name from the URL path only, dropping any query string
        # or fragment and decoding percent-escapes.
        pdf_name = os.path.basename(unquote(urlsplit(link).path))
        if not pdf_name:
            pdf_name = "download_{}.pdf".format(index)
        pdf_path = os.path.join(base_dir, pdf_name)
        part_path = pdf_path + ".part"

        print("Started downloading {}...\n".format(pdf_name))
        try:
            # One streamed request per file; the context manager releases
            # the connection even when the transfer fails.
            with requests.get(link, stream=True, timeout=30) as response:
                # Do not save an HTTP error page under a .pdf name.
                response.raise_for_status()
                with open(part_path, 'wb') as pdf:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        pdf.write(chunk)
            os.replace(part_path, pdf_path)
        except (requests.exceptions.RequestException, OSError) as error:
            print("\nThe pdf link error\n\n")
            logging.warning("Failed to download %s: %s", link, error)
            if os.path.exists(part_path):
                os.remove(part_path)
            continue

        saved_paths.append(pdf_path)
        print("Finished downloading {}\n".format(pdf_name))

    print("\nCompleted Downloads!!...\nCheck {} folder for pdfs".format(base_dir))
    return saved_paths
# TODO: progress bar (sketch only, not implemented):
# def progress_bar(link):
#     link_metadata = requests.get(link, stream=True)
#     file_size_kb = round(float(link_metadata.headers['Content-Length']) / 1024, 2)
def main():
    """Run the downloader from the command line: ``pdf-dl.py url [directory]``.

    PDFs are saved in a ``Downloaded_pdfs`` folder created inside the given
    directory, or inside the current working directory when no directory is
    given or the given one does not exist.  A session log is written to
    ``pdf_downloader.log`` in that same folder.

    Raises:
        SystemExit: With status 1 when the argument count is wrong.
    """
    os.system("clear||cls")
    if len(argv) not in (2, 3):
        print('Error! Invalid arguments')
        Usage()
        # A non-zero status lets calling shells and scripts see the failure.
        raise SystemExit(1)

    # Resolve the page to scan and the folder that receives the downloads.
    base_url = argv[1]
    if len(argv) == 3 and os.path.isdir(argv[2]):
        parent_dir = os.path.abspath(argv[2])
    else:
        parent_dir = os.getcwd()
    base_dir = os.path.join(parent_dir, "Downloaded_pdfs")
    os.makedirs(base_dir, exist_ok=True)

    # Log the session.
    logging.basicConfig(
        filename=os.path.join(base_dir, 'pdf_downloader.log'),
        level=logging.INFO,
        format='%(asctime)s %(message)s',
    )

    get_pdf_links(base_url)
    # ``get_pdf_links`` publishes its result in the module-level
    # ``pdf_links``; skip the download step when nothing was found.
    if not globals().get("pdf_links"):
        return
    get_pdf(base_dir)
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment