Skip to content

Instantly share code, notes, and snippets.

@wtznc
Last active July 13, 2019 18:52
Show Gist options
  • Save wtznc/082ab294be6efb69cb4a3d9861e08963 to your computer and use it in GitHub Desktop.
Save wtznc/082ab294be6efb69cb4a3d9861e08963 to your computer and use it in GitHub Desktop.
Web scraper for Geoffrey Hinton's publications (all PDFs, roughly 250 MB in total)
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Scrape Geoffrey Hinton's publications page and build a list of
# [year, authors, title, url] records, one per paper row in the table.
URL = "http://www.cs.toronto.edu/~hinton/papers.html"


def _clean(text):
    """Normalize a field: spaces -> underscores; drop commas, periods, CR/LF.

    Single-pass str.translate is equivalent to the original chain of
    .replace() calls because every mapping is single-character and the
    replacements do not interact.
    """
    return text.translate(
        str.maketrans({" ": "_", ",": None, ".": None, "\n": None, "\r": None})
    )


response = requests.get(URL)
if response.status_code == 200:
    print("Correctly downloaded website\nExtracting source code...")
    src = response.text
    # Name the parser explicitly: bare BeautifulSoup(src) emits a warning
    # and picks whichever parser happens to be installed.
    soup = BeautifulSoup(src, "html.parser")
    print("Finished!")
    print("Extracting papers...\n")

    papers = []
    raw_papers = soup.find('table').find_all('tr')
    for row in raw_papers:
        # First <td> holds the year; second <td> holds the title in <b>.
        year = row.td.text
        bold = row.select('td')[1].b
        if bold is not None:
            title = " ".join(bold.text.split())  # collapse whitespace runs
        else:
            title = "title_missing"
        authors = " ".join(row.contents[2].contents[0].split())
        link = row.find('a', href=True)
        paper_url = link.attrs['href'] if link is not None else "missing"
        print("Year: " + str(year) + "; Authors: " + str(authors) + "; " + "Title: " + title + "; URL = " + str(paper_url) + "\n")
        papers.append([str(year), str(authors), str(title), str(paper_url)])

    print("Finished preprocessing articles!")
    print("Removing whitespaces from the name field...")
    for paper in papers:
        paper[1] = _clean(paper[1])  # authors field
    print("Done!")
    print("Now let's remove some whitespaces from the titles")
    for paper in papers:
        paper[2] = _clean(paper[2])  # title field
    print("We're all set!")
    # Guard: indexing papers[0] would raise IndexError on an empty table.
    if papers:
        print('\nAn example of a paper from our list:\n', papers[0])
@wtznc
Copy link
Author

wtznc commented Jul 13, 2019

refactored

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment