@mluerig
Created January 13, 2020 09:47
paperpile shared folder download pdf crawler
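The script below walks the pages of a publicly shared Paperpile folder, collects every link whose href contains "download", and saves each PDF to a local directory under a running index.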
# Download all PDFs from a publicly shared Paperpile folder.
# Set shared_folder_url below before running.
import os
import re
import urllib.request

from bs4 import BeautifulSoup

shared_folder_url = ""  # your shared folder url
save_dir = "D:\\Temp\\pdfs"

# Collect the download links from every page of the shared folder.
all_urls = []
for page in ["", "/2", "/3", "/4"]:  # adjust if you have more or fewer pages
    base_url = shared_folder_url + page
    resp = urllib.request.urlopen(base_url)
    soup = BeautifulSoup(resp, "html.parser", from_encoding=resp.info().get_param("charset"))
    for link in soup.find_all("a", href=re.compile("download")):
        print(link["href"])
        all_urls.append("https://paperpile.com" + link["href"])

# Download each PDF and save it under a numeric filename.
idx = 1
for pdf_link in all_urls:
    print(pdf_link)
    try:
        urllib.request.urlretrieve(pdf_link, os.path.join(save_dir, str(idx) + ".pdf"))
        idx += 1
    except Exception as ex:
        print(str(ex.__class__.__name__) + " - " + str(ex))
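To run it, install BeautifulSoup (pip install beautifulsoup4), paste your shared folder URL into shared_folder_url, make sure the directory in save_dir exists, and adjust the page suffixes ("/2", "/3", ...) to match the number of pages in your shared folder.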