Paperpile shared folder PDF download crawler
# This is a snippet for downloading PDFs from Paperpile to your local computer.
# See https://forum.paperpile.com/t/download-multiple-pdfs-to-computer/2405/7 for the details.
#
# Summary:
# 1. In your Paperpile account, create a shared folder and move the papers whose PDFs you want to download into it.
# 2. Set the shared folder link in the settings below and run the script. Install dependencies if needed:
#      pip3 install urllib3 -U
#      pip3 install beautifulsoup4 -U
#
# Tested with Python 3.9.5.
import os
import re
import urllib.request

from bs4 import BeautifulSoup
# settings
shared_folder_url = ""  # your shared folder URL
save_dir = ""           # local directory to save the PDFs into
end_page = 1            # update to the last page number of the shared folder
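
# Example settings (hypothetical values, not from the original gist; replace
# with your own shared folder URL, a writable local directory, and the real
# last page number of the folder):
#   shared_folder_url = "https://paperpile.com/shared/AbCdEf"
#   save_dir = "/home/me/paperpile_pdfs"
#   end_page = 3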
pages = [""]
for i in range(2, end_page):
pages.append("/" + str(i))
# Collect every PDF download link from each page of the shared folder.
all_urls = []
for page in pages:
    base_url = shared_folder_url + page
    req = urllib.request.urlopen(base_url)
    soup = BeautifulSoup(req, "html.parser",
                         from_encoding=req.info().get_param('charset'))
    for link in soup.find_all("a", href=re.compile("download")):
        print(link['href'])
        all_urls.append("https://paperpile.com" + link['href'])
# Download each PDF, numbering the files 1.pdf, 2.pdf, ...
idx = 1
for pdf_link in all_urls:
    print(pdf_link)
    try:
        urllib.request.urlretrieve(pdf_link,
                                   os.path.join(save_dir, str(idx) + ".pdf"))
        idx += 1
    except Exception as ex:
        print(str(ex.__class__.__name__) + " - " + str(ex))
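
# Minimal usage sketch (assumes the file is saved as paperpile_crawler.py, as
# in the original gist): fill in the settings above, then run
#   python3 paperpile_crawler.py
# Downloaded files appear in save_dir as 1.pdf, 2.pdf, ...; a failed download
# only prints the exception and does not advance the counter.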