@hibetterheyj
Last active April 11, 2024 12:54
Download all PDF files from a website
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% TODO
# rewrite as a function [okay]
# add argparse [okay]
# print all file names
# set save folder
# TODO: download all files
# update running examples [okay]
# TODO: merge files with the same name
# TODO: name the save subfolder after the web page -> convert web page names to underscore_case
# TODO: extend to other file types using a class
# TODO: download files whose names contain non-Latin characters
# TODO: add tqdm viz
# TODO: add file selection
# TODO: add log-in
# TODO: modify according to style guide
#%% Functions
def all_pdf_download(args):
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(args.folder_path):
            os.mkdir(args.folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    # some servers reject requests that lack a User-Agent header
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    # select all anchors whose href ends with ".pdf"
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        # name each pdf file after the last portion of its link, which is unique in this case
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, filename)
        if args.print_all:
            print("[{}/{}] {}".format(counter + 1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            # resolve relative links against the base URL before downloading
            f.write(requests.get(urljoin(base_url, link['href']),
                                 headers={'User-Agent': 'Custom'}).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download all PDF files linked from a web page')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='URL of the web page to scan for PDF links')
    # --print-all (defaults to True via set_defaults, so filenames are printed by default)
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print every filename while downloading")
    parser.set_defaults(print_all=True)
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files in the current working directory")
    parser.set_defaults(save_here=False)
    # -f/--folder_path
    # default: Downloads/ in the user's home directory, obtained via os.path.expanduser('~')
    parser.add_argument('-f', '--folder_path',
                        default=os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)
#%% reference
# from https://stackoverflow.com/questions/54616638/download-all-pdf-files-from-a-website-using-python
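#%% Programmatic use (sketch)
# A minimal sketch of driving the function without the command line, by building
# an argparse.Namespace with the same attributes the parser produces; the URL
# below is a placeholder.
# from argparse import Namespace
# all_pdf_download(Namespace(link="https://example.com", print_all=True,
#                            save_here=True, folder_path=os.getcwd()))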
@hibetterheyj (Author)

@woyzeck1978 you might find related scripts that use Selenium, such as this link~
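A minimal sketch of that Selenium route, assuming Selenium 4 with a Chrome driver available on PATH; the URL below is a placeholder and the selector mirrors the one used in the gist:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/page-with-pdfs")  # placeholder URL
# collect hrefs from anchors ending in ".pdf", including links rendered by JavaScript
pdf_links = [a.get_attribute("href")
             for a in driver.find_elements(By.CSS_SELECTOR, "a[href$='.pdf']")]
driver.quit()
print(pdf_links)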
