wtznc/hinton.py
Last active Jul 13, 2019

Web scraper for Geoffrey Hinton's publications (all PDFs, roughly 250 MB in total).
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

URL = "http://www.cs.toronto.edu/~hinton/papers.html"

# Download the publications page.
response = requests.get(URL)
if response.status_code != 200:
    raise SystemExit("Failed to download " + URL)
print("Correctly downloaded website\nExtracting source code...")
src = response.text
soup = BeautifulSoup(src, "html.parser")  # explicit parser avoids a bs4 warning
print("Finished!")

print("Extracting papers...\n")
papers = []
raw_papers = soup.find("table").find_all("tr")
for row in raw_papers:
    # First cell: year of publication.
    year = row.td.text
    # Second cell: the title is wrapped in a <b> tag (missing for some rows).
    if row.select("td")[1].b is not None:
        title = " ".join(row.select("td")[1].b.text.split())
    else:
        title = "title_missing"
    # Third cell: author list; collapse runs of whitespace.
    authors = " ".join(row.contents[2].contents[0].split())
    # Link to the paper's PDF, if the row contains one.
    if row.find("a", href=True) is not None:
        paper_url = row.find("a", href=True).attrs["href"]
    else:
        paper_url = "missing"
    print("Year: " + str(year) + "; Authors: " + str(authors) + "; "
          + "Title: " + title + "; URL = " + str(paper_url) + "\n")
    papers.append([str(year), str(authors), str(title), str(paper_url)])
print("Finished preprocessing articles!")

# Make the author and title fields filename-friendly:
# strip newlines, drop commas and periods, and replace spaces with underscores.
print("Removing whitespaces from the name field...")
for x in range(len(papers)):
    for r in (("\n", ""), ("\r", ""), (" ", "_"), (",", ""), (".", "")):
        papers[x][1] = papers[x][1].replace(*r)
print("Done!")

print("Now let's remove some whitespaces from the titles")
for x in range(len(papers)):
    for r in (("\n", ""), ("\r", ""), (" ", "_"), (",", ""), (".", "")):
        papers[x][2] = papers[x][2].replace(*r)
print("We're all set!")

print("\nAn example of a paper from our list:\n", papers[0])
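The snippet stops after building the papers list, while time and urllib.request are imported but never used above; presumably the PDFs are fetched in a later step. Below is a minimal sketch of such a step, not taken from the original gist: it assumes the hrefs may be relative to the papers page (hence urljoin) and builds filenames from the cleaned author and title fields, which may still need extra sanitizing.

import os
from urllib.parse import urljoin

os.makedirs("hinton_papers", exist_ok=True)
for year, authors, title, paper_url in papers:
    # Skip rows without a usable PDF link (illustrative filter, not from the gist).
    if paper_url == "missing" or not paper_url.lower().endswith(".pdf"):
        continue
    pdf_url = urljoin(URL, paper_url)  # resolve relative links against the page URL
    filename = os.path.join("hinton_papers", year + "_" + authors + "_" + title + ".pdf")
    try:
        urllib.request.urlretrieve(pdf_url, filename)
        print("Saved", filename)
    except Exception as exc:
        print("Could not download", pdf_url, "->", exc)
    time.sleep(1)  # be polite to the server between requests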
@wtznc (Owner, Author) commented Jul 13, 2019

refactored