@hibetterheyj
Last active April 11, 2024 12:54
Download all PDF files from a website
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% TODO
# rewrite as a function [okay]
# add argparse [okay]
# print all file names
# set save folder
# TODO: download all files
# update running examples [okay]
# TODO: merge files with the same name
# TODO: name the save subfolder after the web page -> convert web page names to underscore_case
# TODO: extend to other file types using a class
# TODO: download files whose names contain non-Latin characters
# TODO: add tqdm viz
# TODO: add file selection
# TODO: add log-in
# TODO: modify according to style guide
#%% Functions
def all_pdf_download(args):
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(args.folder_path):
            os.mkdir(args.folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    # some servers reject requests that lack a User-Agent header
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    # select all anchors whose href ends with ".pdf"
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        # name each pdf file after the last portion of its link, which is unique in this case
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, filename)
        if args.print_all:
            print("[{}/{}] {}".format(counter + 1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            # resolve relative links against the base URL before downloading
            f.write(requests.get(urljoin(base_url, link['href']),
                                 headers={'User-Agent': 'Custom'}).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download all PDF files linked from a web page')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='URL of the web page to scan for PDF links')
    # --print-all (defaults to True via set_defaults, so filenames are printed by default)
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print every filename while downloading")
    parser.set_defaults(print_all=True)
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files in the current working directory")
    parser.set_defaults(save_here=False)
    # -f/--folder_path
    # default: Downloads/ in the user's home directory, obtained via os.path.expanduser('~')
    parser.add_argument('-f', '--folder_path',
                        default=os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)
#%% reference
# from https://stackoverflow.com/questions/54616638/download-all-pdf-files-from-a-website-using-python
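#%% Programmatic use (sketch)
# A minimal sketch of driving the function without the command line, by building
# an argparse.Namespace with the same attributes the parser produces; the URL
# below is a placeholder.
# from argparse import Namespace
# all_pdf_download(Namespace(link="https://example.com", print_all=True,
#                            save_here=True, folder_path=os.getcwd()))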
@hibetterheyj (Author)

@woyzeck1978 you might find related scripts that use Selenium, such as this link~
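A minimal sketch of that Selenium route, assuming Selenium 4 with a Chrome driver available on PATH; the URL below is a placeholder and the selector mirrors the one used in the gist:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/page-with-pdfs")  # placeholder URL
# collect hrefs from anchors ending in ".pdf", including links rendered by JavaScript
pdf_links = [a.get_attribute("href")
             for a in driver.find_elements(By.CSS_SELECTOR, "a[href$='.pdf']")]
driver.quit()
print(pdf_links)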
