Extract the first page of all NIPS papers and combine them in one PDF file
"""Extract the first page of all NIPS papers and combine them in one PDF file
:author: Davide Zilli
:date: 03 Dec 2017
:lastedit: 03 Dec 2017
"""
import os
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileWriter, PdfFileReader

NIPS = "https://papers.nips.cc"
proceedings_url = NIPS + "/book/advances-in-neural-information-processing-systems-30-2017"
papers_dir = "/tmp/NIPS2017/papers"
proceedings_page_path = os.path.join(papers_dir, "proceedings.html")
abstracts_filename = os.path.join(papers_dir, "NIPS2017-abstracts.pdf")
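
# Everything fetched from the web (the proceedings page, per-paper pages and
# PDFs) is cached under papers_dir, so an interrupted run can resume without
# re-downloading.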

def get_page(url: str, path: str) -> BeautifulSoup:
    """Get a web page and store it, or load it from disk if present"""
    if os.path.exists(path):
        with open(path) as p:
            page = p.read()
    else:
        page = requests.get(url).text
        with open(path, 'w') as p:
            p.write(page)
    # Name the parser explicitly so bs4 does not warn and behaviour does not
    # depend on which optional parsers happen to be installed.
    return BeautifulSoup(page, "html.parser")


def get_pdf(url: str, path: str) -> PdfFileReader:
    """Get a PDF from the web and store it, or load it from disk if present"""
    if os.path.exists(path):
        reader = PdfFileReader(path)
    else:
        pdf = requests.get(url)
        reader = PdfFileReader(BytesIO(pdf.content))
        # Re-serialise through PdfFileWriter so the cached copy on disk is a
        # clean, parseable PDF.
        writer = PdfFileWriter()
        writer.appendPagesFromReader(reader)
        with open(path, 'wb') as p:
            writer.write(p)
    return reader


def extract() -> None:
    """Extract and combine the first page of all proceedings' PDFs into one"""
    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)
    proceedings_soup = get_page(proceedings_url, proceedings_page_path)
    paper_links = proceedings_soup.find_all("a")
    print("found {} links".format(len(paper_links)))
    scanned = 0
    errors = []
    with open(abstracts_filename, "wb") as abstracts_pdf:
        writer = PdfFileWriter()
        for link in paper_links:
            # Paper pages are linked as "/paper/<id>"; .get() skips anchors
            # that have no href attribute at all.
            if link.get("href", "").startswith("/paper/"):
                try:
                    scanned += 1
                    print("\n*** [{:d}] paper page found".format(scanned))
                    print(link["href"])
                    paper_soup = get_page(
                        NIPS + link["href"],
                        os.path.join(papers_dir, os.path.basename(link["href"])))
                    # The paper page links its PDF with the literal text "[PDF]"
                    pdf_links = paper_soup.find_all("a")
                    pdf_a_link = next(filter(lambda x: x.text == "[PDF]", pdf_links))
                    pdf_link = pdf_a_link["href"]
                    print("found PDF link:", pdf_link)
                    pdf = get_pdf(NIPS + pdf_link,
                                  os.path.join(papers_dir, os.path.basename(pdf_link)))
                    writer.addPage(pdf.getPage(0))
                    print("Added paper '{}'".format(
                        paper_soup.find_all("h2", attrs={"class": "subtitle"})[0].text))
                except (MemoryError, KeyboardInterrupt):
                    break
                except Exception as e:
                    print("Failed to add paper: '{}' ({})".format(link["href"], e))
                    errors.append(link["href"])
        writer.write(abstracts_pdf)
    print("\nerrors:\n", errors)
    print("output in '{}'".format(abstracts_filename))


if __name__ == '__main__':
    extract()
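# Usage (the script filename below is illustrative):
#     pip install requests beautifulsoup4 PyPDF2
#     python nips_first_pages.py
# The combined PDF is written to /tmp/NIPS2017/papers/NIPS2017-abstracts.pdf.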