Skip to content

Instantly share code, notes, and snippets.

@IanCarrasco
Last active July 19, 2019 00:52
Show Gist options
  • Save IanCarrasco/db2cb29f52e3d34434d39209580e2849 to your computer and use it in GitHub Desktop.
Save IanCarrasco/db2cb29f52e3d34434d39209580e2849 to your computer and use it in GitHub Desktop.
Stanford CS224n Scraper
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger, PdfFileReader
from io import BytesIO
# Scrape the CS224n readings directory, download every linked PDF and merge
# them into a single file, 'cs224n_notes.pdf', in the current directory.

# Seconds to wait on the server before giving up; without an explicit
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Fetch and parse the directory listing. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed.
origin = 'https://web.stanford.edu/class/cs224n/readings/'
page = requests.get(origin, timeout=REQUEST_TIMEOUT)
page.raise_for_status()  # fail loudly instead of parsing an error page
content = BeautifulSoup(page.text, 'html.parser')

# Collect links to all PDFs in the directory. `href=True` skips anchors that
# have no href attribute, which would otherwise raise KeyError.
pdf_links = [
    origin + elem['href']
    for elem in content.find_all('a', href=True)
    if 'pdf' in elem['href'] and 'cs' in elem['href']
]

# Download every PDF into memory and open it with a reader.
pdfs = []
for link in pdf_links:
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    # Stop here rather than handing an HTML error body to the PDF reader.
    response.raise_for_status()
    memory = BytesIO(response.content)
    pdfs.append(PdfFileReader(memory))

# Merge the downloaded PDFs in the order their links were found.
merger = PdfFileMerger()
for pdf in pdfs:
    merger.append(pdf)

# Export the merged PDF to the output file.
with open('cs224n_notes.pdf', 'wb') as fout:
    merger.write(fout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment