# Web scraper for Geoffrey Hinton's publications (all PDFs: roughly 250 MB)
import requests
import urllib.request
import time

from bs4 import BeautifulSoup
URL = "http://www.cs.toronto.edu/~hinton/papers.html"

response = requests.get(URL)
if response.status_code == 200:
    print("Correctly downloaded website\nExtracting source code...")
    src = response.text
    soup = BeautifulSoup(src, "html.parser")  # explicit parser avoids a bs4 warning
    print("Finished!")
else:
    raise SystemExit("Failed to download " + URL + " (status " + str(response.status_code) + ")")
print("Extracting papers...\n") | |
papers = [] | |
raw_papers = soup.find('table').find_all('tr') | |
for row in raw_papers: | |
year = row.td.text | |
if row.select('td')[1].b != None: | |
title = row.select('td')[1].b.text | |
title = " ".join(title.split()) | |
else: | |
title = "title_missing" | |
authors = row.contents[2].contents[0] | |
authors = " ".join(authors.split()) | |
if row.find('a', href=True) != None: | |
paper_url = row.find('a', href=True).attrs['href'] | |
else: | |
paper_url = "missing" | |
print("Year: " + str(year) + "; Authors: " + str(authors) + "; " + "Title: " + title + "; URL = " + str(paper_url) + "\n") | |
papers.append([str(year), str(authors), str(title), str(paper_url)]) | |
print("Finished preprocessing articles!") | |
print("Removing whitespaces from the name field...") | |
for x in range(0, len(papers)): | |
for r in (("\n", ""), ("\r", ""), (" ", "_"), (",",""), (".","")): | |
papers[x][1] = papers[x][1].replace(*r) | |
print("Done!") | |
print("Now let's remove some whitespaces from the titles") | |
for x in range(0, len(papers)): | |
for r in (("\n", ""), ("\r", ""), (" ", "_"), (",", ""), (".", "")): | |
papers[x][2] = papers[x][2].replace(*r) | |
print("We're all set!") | |
print('\nAn example of a paper from our list:\n', papers[0]) |
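
# The urllib.request and time imports above are otherwise unused, and the
# header promises the PDFs themselves (~250 MB). What follows is a minimal
# sketch of that download step, not part of the original listing: it assumes
# the hrefs on papers.html are either absolute URLs or paths relative to the
# site root, and it builds file names from the cleaned year/author/title fields.
BASE = "http://www.cs.toronto.edu/~hinton/"  # assumed base for relative links
for year, authors, title, paper_url in papers:
    if paper_url == "missing" or not paper_url.lower().endswith(".pdf"):
        continue  # skip rows without a usable PDF link
    full_url = paper_url if paper_url.startswith("http") else BASE + paper_url
    filename = year.strip() + "_" + authors + "_" + title + ".pdf"
    try:
        urllib.request.urlretrieve(full_url, filename)
        print("Downloaded " + filename)
    except Exception as exc:
        print("Could not download " + full_url + ": " + str(exc))
    time.sleep(1)  # pause between requests to be polite to the server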