Paperpile shared folder PDF download crawler
# This is a snippet for downloading PDFs from Paperpile to your local computer.
# See https://forum.paperpile.com/t/download-multiple-pdfs-to-computer/2405/7 for the details.
#
# Summary:
# 1. In your Paperpile account, create a shared folder and move the papers whose PDFs you want to download into it.
# 2. Set the shared folder link in the settings below and run the script. Install dependencies if needed:
#      pip3 install urllib3 -U
#      pip3 install beautifulsoup4 -U
#
# Tested with Python 3.9.5.
import os
import re
import urllib.request

from bs4 import BeautifulSoup
# settings
shared_folder_url = ""  # your shared folder URL
save_dir = ""           # local directory to save the PDFs into
end_page = 1            # update to the last page number of the shared folder
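
# Example settings (hypothetical values, not from the original gist; replace
# with your own shared folder URL, a writable local directory, and the real
# last page number of the folder):
#   shared_folder_url = "https://paperpile.com/shared/AbCdEf"
#   save_dir = "/home/me/paperpile_pdfs"
#   end_page = 3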
pages = [""]
for i in range(2, end_page):
pages.append("/" + str(i))
# Collect every PDF download link from each page of the shared folder.
all_urls = []
for page in pages:
    base_url = shared_folder_url + page
    req = urllib.request.urlopen(base_url)
    soup = BeautifulSoup(req, "html.parser",
                         from_encoding=req.info().get_param('charset'))
    for link in soup.find_all("a", href=re.compile("download")):
        print(link['href'])
        all_urls.append("https://paperpile.com" + link['href'])
# Download each PDF, numbering the files 1.pdf, 2.pdf, ...
idx = 1
for pdf_link in all_urls:
    print(pdf_link)
    try:
        urllib.request.urlretrieve(pdf_link,
                                   os.path.join(save_dir, str(idx) + ".pdf"))
        idx += 1
    except Exception as ex:
        print(str(ex.__class__.__name__) + " - " + str(ex))
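
# Minimal usage sketch (assumes the file is saved as paperpile_crawler.py, as
# in the original gist): fill in the settings above, then run
#   python3 paperpile_crawler.py
# Downloaded files appear in save_dir as 1.pdf, 2.pdf, ...; a failed download
# only prints the exception and does not advance the counter.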