@mluerig
Created January 13, 2020 09:47
paperpile shared folder download pdf crawler
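The script below walks the pages of a publicly shared Paperpile folder, collects every link whose href contains "download", and saves each PDF to a local directory under a running index.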
# Download all PDFs from a publicly shared Paperpile folder.
# Set shared_folder_url below before running.
import os
import re
import urllib.request

from bs4 import BeautifulSoup

shared_folder_url = ""  # your shared folder url
save_dir = "D:\\Temp\\pdfs"

# Collect the download links from every page of the shared folder.
all_urls = []
for page in ["", "/2", "/3", "/4"]:  # adjust if you have more or fewer pages
    base_url = shared_folder_url + page
    resp = urllib.request.urlopen(base_url)
    soup = BeautifulSoup(resp, "html.parser", from_encoding=resp.info().get_param("charset"))
    for link in soup.find_all("a", href=re.compile("download")):
        print(link["href"])
        all_urls.append("https://paperpile.com" + link["href"])

# Download each PDF and save it under a numeric filename.
idx = 1
for pdf_link in all_urls:
    print(pdf_link)
    try:
        urllib.request.urlretrieve(pdf_link, os.path.join(save_dir, str(idx) + ".pdf"))
        idx += 1
    except Exception as ex:
        print(str(ex.__class__.__name__) + " - " + str(ex))
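To run it, install BeautifulSoup (pip install beautifulsoup4), paste your shared folder URL into shared_folder_url, make sure the directory in save_dir exists, and adjust the page suffixes ("/2", "/3", ...) to match the number of pages in your shared folder.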