Skip to content

Instantly share code, notes, and snippets.

@amrrs
Forked from wtznc/hinton.py
Created May 27, 2019 11:51
Show Gist options
  • Save amrrs/0deaaedfa825dbd0388d174f0e011868 to your computer and use it in GitHub Desktop.
Save amrrs/0deaaedfa825dbd0388d174f0e011868 to your computer and use it in GitHub Desktop.
Web scraper for Geoffrey Hinton's publications, roughly 250 MB total download
# download this file
# install bs4 (re, os and urllib are part of the Python standard library)
# run python3 hinton.py
from urllib import request
import re
import os
from bs4 import BeautifulSoup
def main():
    """Scrape Geoffrey Hinton's publications page and download every
    site-relative PDF into the current working directory.

    Side effects: prints progress to stdout and writes .pdf files to
    os.getcwd(). Network and write failures for individual files are
    reported and skipped so one bad link does not abort the whole run.
    """
    curdir = os.getcwd()
    url = "http://www.cs.toronto.edu/~hinton/papers.html"
    print("Fetching articles information: ", url)
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, 'html.parser')
    # Anchors whose href contains ".pdf" anywhere in the string.
    links = soup.find_all('a', href=re.compile(r'(\.pdf)'))
    # Split absolute URLs (start with "http...") from site-relative hrefs;
    # only the relative ones are hosted under ~hinton/ and get downloaded.
    weird = []    # absolute URLs hosted elsewhere -- intentionally skipped
    correct = []  # relative hrefs to resolve against ~hinton/
    for link in links:
        href = link.attrs['href']
        # startswith() is safe on an empty href, unlike href[0].
        if href.startswith('h'):
            weird.append(href)
        else:
            correct.append(href)
    for a in correct:
        fullurl = "http://www.cs.toronto.edu/~hinton/" + a
        print(fullurl)
        # Skip malformed links whose file-name part is empty (".../.pdf").
        if fullurl[-5:][0] == '/':
            continue
        # basename() extracts the file name regardless of how many
        # directory components the href has (was a brittle str(a)[5:]).
        destination = os.path.join(curdir, os.path.basename(a))
        try:
            request.urlretrieve(fullurl, destination)
        except Exception as e:
            # Report which download failed and why, then keep going.
            print('error', fullurl, e)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment