Skip to content

Instantly share code, notes, and snippets.

@bsodhi
Created November 17, 2022 09:57
Show Gist options
  • Save bsodhi/9725cf269aa55d9b6d6f3c31e5bd8cd3 to your computer and use it in GitHub Desktop.
Save bsodhi/9725cf269aa55d9b6d6f3c31e5bd8cd3 to your computer and use it in GitHub Desktop.
Scraping PDF files
"""
Written with a lot of help from StackOverflow community
and Python API documentation -- greatly appreciated!
"""
import sys
import requests
import urllib3
import concurrent.futures
from bs4 import BeautifulSoup
from datetime import datetime as DT
from pathlib import Path
# Shared worker pool (at most 6 threads) that main() submits download_file()
# tasks to.
TPE = concurrent.futures.ThreadPoolExecutor(max_workers=6)
# Suppress urllib3's InsecureRequestWarning messages.
# NOTE(review): the requests.get() call in make_http_request() does not pass
# verify=False, so this warning is presumably never triggered -- confirm.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# User-Agent header value sent by make_http_request() with every request.
FIXED_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0"
# When truthy, debug() forwards its messages to log(); otherwise they are dropped.
DEBUG = False
# Value passed as the ``timeout`` argument of requests.get().
HTTP_TIMEOUT_SEC = 5
# NOTE(review): this value is only interpolated into make_http_request()'s log
# message; nothing in the visible code actually waits for this long.
HTTP_DELAY_SEC = 2
def log(msg):
    """Print *msg* to stdout, prefixed with the current local timestamp.

    The line has the form ``[YYYY-MM-DD@HH:MM:SSAM] : msg`` (12-hour clock).
    """
    template = "[{0}] : {1}"
    stamp = format(DT.now(), "%Y-%m-%d@%I:%M:%S%p")
    print(template.format(stamp, msg))
def debug(msg):
    """Forward *msg* to log() only while the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    log(msg)
def make_http_request(url):
    """Send a GET request for *url* and return the ``requests`` response.

    The request carries the fixed User-Agent header (FIXED_UA) and uses
    HTTP_TIMEOUT_SEC as its timeout. Any exception raised by
    ``requests.get`` propagates to the caller.

    NOTE(review): the log line mentions HTTP_DELAY_SEC, but no delay is
    performed here -- the value is only reported.
    """
    log("Requesting URL {0}. Delay {1}s".format(url, HTTP_DELAY_SEC))
    request_headers = {'User-Agent': FIXED_UA}
    response = requests.get(
        url, headers=request_headers, timeout=HTTP_TIMEOUT_SEC)
    return response
def download_file(url, file_path):
    """Download *url* and store the response body at *file_path*.

    Args:
        url: Address of the document to fetch.
        file_path: ``pathlib.Path`` of the destination file. If it already
            exists the download is skipped.

    Returns:
        None. Progress and failures are reported on stdout.

    Raises:
        Whatever make_http_request() or the file write raises; nothing is
        caught here.
    """
    if file_path.exists():
        print("*** File {} already exists. Skipping.".format(file_path))
        return
    # Fetch BEFORE creating the file: opening the file first would leave an
    # empty file behind whenever the request raises, and the exists() check
    # above would then skip that document on every later run.
    response = make_http_request(url)
    if response.status_code != requests.codes.ok:
        # Do not save an HTTP error page under the document's name.
        log("Failed to download file from URL {0}. Error: {1}".format(
            url, response.reason))
        return
    # open in binary mode and write the payload
    with open(file_path, "wb") as file:
        file.write(response.content)
    print("Downloaded file {0} from {1}".format(file_path, url))
def make_page_soup(page_url):
    """Fetch *page_url* and parse it into a BeautifulSoup tree.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser when the
        response status is OK; otherwise the failure is logged and ``None``
        is returned.
    """
    response = make_http_request(page_url)
    if response.status_code != requests.codes.ok:
        log("Failed to get page at URL {0}. Error: {1}".format(
            page_url, response.reason))
        return None
    return BeautifulSoup(response.content, 'lxml')
def _safe_file_name(link_text, url):
    """Turn a link's text into a single, safe path component."""
    name = link_text.strip().replace(" ", "_")
    # Link text comes from an untrusted page: neutralise path separators so
    # the name can neither point outside the output directory nor require
    # sub-directories that do not exist.
    for separator in ("/", "\\"):
        name = name.replace(separator, "_")
    if name in ("", ".", ".."):
        # Links without usable text (e.g. image links) would otherwise map
        # to the output directory itself; use the URL's last segment instead.
        name = url.rstrip("/").rsplit("/", 1)[-1]
    return name


def _log_download_failure(url, future):
    """Log the exception raised by a finished download task, if any."""
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        log("**** Error downloading {0}: {1}".format(url, error))


def main(pg_url, out_dir):
    """Queue a download for every document link found on the page *pg_url*.

    Every ``<a>`` element whose ``href`` ends (case-insensitively) with
    ``.pdf``, ``.doc``, ``.epub`` or ``.docx`` is submitted to the shared
    thread pool (TPE) as a download_file() task. The destination file is
    named after the link text (spaces replaced by underscores) inside
    *out_dir*, which is created if missing.

    Args:
        pg_url: URL of the HTML page to scan; also the base against which
            relative links are resolved.
        out_dir: Directory in which downloaded files are stored.

    Returns:
        None. The function returns once the tasks are submitted; it does not
        wait for them. Errors while scanning are logged rather than raised,
        and failures inside download tasks are logged when each task ends.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    try:
        file_exts = (".pdf", ".doc", ".epub", ".docx")
        soup = make_page_soup(pg_url)
        if soup is None:
            # make_page_soup() has already logged why the page is unavailable.
            return
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to download.
                continue
            href = str(href)
            if not href.lower().endswith(file_exts):
                continue
            # Relative hrefs must be made absolute before they are requested.
            url = urljoin(pg_url, href)
            fp = Path(out_dir, _safe_file_name(link.text, url))
            task = TPE.submit(download_file, url, fp)
            # Without a callback, an exception inside the worker thread
            # would stay hidden in the Future and never be reported.
            task.add_done_callback(
                lambda done, src=url: _log_download_failure(src, done))
    except Exception as ex:
        log("**** Error "+str(ex))
if __name__ == "__main__":
    # Exactly two command-line arguments are expected: the page URL to scan
    # and the directory to store downloads in. Anything else prints usage.
    if len(sys.argv) == 3:
        main(*sys.argv[1:])
    else:
        print("Usage: {0} INPUT_URL OUTPUT_DIR_PATH".format(
            sys.argv[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment