Skip to content

Instantly share code, notes, and snippets.

@Airtnp
Last active December 21, 2020 21:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Airtnp/f6d4ce2ee116dab9ffdfbd3f11c3e762 to your computer and use it in GitHub Desktop.
Save Airtnp/f6d4ce2ee116dab9ffdfbd3f11c3e762 to your computer and use it in GitHub Desktop.
PDF_download.py
from urllib.request import urlretrieve
import requests
from bs4 import BeautifulSoup
import sys
import os
import socket
# Process-wide default timeout (seconds) for newly created sockets.
# urlretrieve() is called below without any timeout argument, so this global
# default is what keeps a stalled download from blocking indefinitely.
socket.setdefaulttimeout(150)
import re
class DownloadError(Exception):
    """Raised by the progress hook when the computed progress is negative.

    It must derive from ``Exception``: raising an instance of a plain class
    fails with ``TypeError`` instead of raising the intended error.
    """
def show_block_fn(fn):
    """Build a progress callback that prints a 40-character bar labelled *fn*.

    The returned callable is passed to ``urlretrieve`` as its report hook and
    is invoked with three numbers ``(a, b, c)``; the percentage shown is
    ``100 * a * b / c`` (blocks transferred * block size / total size),
    clamped to at most 100.

    Raises:
        DownloadError: from the returned callback, when the computed
            percentage is negative (e.g. a negative total size).
    """
    def show_block(a, b, c):
        if c == 0:
            # A zero total size (empty file) has nothing left to transfer;
            # report it as complete instead of dividing by zero.
            per = 100.0
        else:
            per = 100.0 * a * b / c
        if per > 100:
            # The last block usually overshoots the real size.
            per = 100
        if per < 0:
            raise DownloadError()
        filled = int(per / 2.5)  # 40 cells, each worth 2.5 %
        bl = "[" + "*" * filled + "-" * (40 - filled) + "]"
        print("\t%s:%s %.2f%%" % (fn, bl, per))
    return show_block
def download_file(url, idx, local_filename, folder):
    """Download *url* to ``folder/local_filename``, printing a progress bar.

    Args:
        url: Address of the file to fetch.
        idx: Unused; kept so existing callers keep working.
        local_filename: Target file name; ``%20`` escapes become spaces.
        folder: Existing directory the file is written into.

    Returns:
        Always ``None``. A failed download is reported on stdout as
        ``Error: <url>`` rather than raised.
    """
    local_filename = local_filename.replace("%20", " ")
    # The transfer is done by urlretrieve alone. The extra streaming
    # requests.get() that used to precede it was never read or closed, so it
    # only doubled the traffic and leaked a connection.
    try:
        urlretrieve(url, folder + '/' + local_filename, show_block_fn(local_filename))
    except Exception:
        # Best-effort: report and continue with the next file. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the script.
        print("Error: " + url)
    return None
def _resolve_link(root_link, href, download_prefix=None):
    """Return the URL to fetch for *href* found on the page at *root_link*."""
    from urllib.parse import urljoin

    if download_prefix:
        # An explicit prefix always wins and is joined verbatim.
        return download_prefix + '/' + href
    if root_link.endswith('.html'):
        # A page URL: relative links resolve against its parent directory.
        base = root_link
    else:
        # Anything else is treated as a directory listing.
        base = root_link.rstrip('/') + '/'
    # urljoin also copes with absolute URLs and root-relative ("/x/y.pdf")
    # hrefs, which plain string concatenation turned into broken links.
    return urljoin(base, href)


def download_pdf(root_link, folder, download_prefix = None):
    """Download every document/archive linked from the page at *root_link*.

    The page is fetched and each ``<a href>`` whose URL contains a file name
    ending in one of the known suffixes (pdf, ppt(x), doc(x), tar.gz, zip,
    rar) is saved into *folder*.

    Args:
        root_link: URL of the page to scan for links.
        folder: Directory to save into; created if it does not exist.
        download_prefix: Optional base that, when given, is prepended (with a
            ``/``) to every href instead of resolving it against *root_link*.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    suffix = ['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.tar.gz', '.zip', '.rar']
    # Compiled once, outside the link loop. re.escape escapes every dot of the
    # suffix (".tar.gz"), and (?!\w) stops ".doc"/".ppt" from also matching
    # ".docx"/".pptx", which used to download those files twice.
    patterns = [
        (suf, re.compile(r"([-_.\w]+)" + re.escape(suf) + r"(?!\w)"))
        for suf in suffix
    ]

    try:
        os.mkdir(folder)
    except FileExistsError:
        print("Folder already exists: {}".format(folder))
    except OSError as e:
        # Any other failure (missing parent, permissions) means nothing can
        # be saved; say so instead of claiming the folder already exists.
        print("Cannot create folder {}: {}".format(folder, e))
        return

    r = requests.get(root_link)
    if r.status_code != 200:
        print("A errors occurs.")
        return

    soup = BeautifulSoup(r.text, 'lxml')
    idx = 1
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        new_link = _resolve_link(root_link, href, download_prefix)
        for suf, pattern in patterns:
            m = pattern.search(new_link)
            if not m:
                continue
            filename = m.group(1) + suf
            print("\nDownloading: " + new_link + " -> " + filename)
            try:
                download_file(new_link, str(idx), filename, folder)
            except Exception:
                print("Failed to download url {}".format(new_link))
            idx += 1
            # One link is one file: do not fetch it again for another suffix.
            break
    print("All download finished")
if __name__ == "__main__":
    # Command line: <root_link> <folder> [download_prefix]
    if len(sys.argv) < 3:
        # Previously this case crashed with IndexError on sys.argv[2]/[3].
        print("Usage: {} <root_link> <folder> [download_prefix]".format(sys.argv[0]))
        sys.exit(1)
    # Pass two or three arguments; anything beyond the third is ignored,
    # as before.
    download_pdf(*sys.argv[1:4])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment