# Web scraper for Geoffrey Hinton's publications (all PDFs: roughly 250 MB)
import requests
import urllib.request
import time

from bs4 import BeautifulSoup
URL = "http://www.cs.toronto.edu/~hinton/papers.html"

response = requests.get(URL)
if response.status_code == 200:
    print("Correctly downloaded website\nExtracting source code...")
    src = response.text
    soup = BeautifulSoup(src, "html.parser")  # explicit parser avoids a bs4 warning
    print("Finished!")
else:
    raise SystemExit("Failed to download " + URL + " (status " + str(response.status_code) + ")")
print("Extracting papers...\n") | |
papers = [] | |
raw_papers = soup.find('table').find_all('tr') | |
for row in raw_papers: | |
year = row.td.text | |
if row.select('td')[1].b != None: | |
title = row.select('td')[1].b.text | |
title = " ".join(title.split()) | |
else: | |
title = "title_missing" | |
authors = row.contents[2].contents[0] | |
authors = " ".join(authors.split()) | |
if row.find('a', href=True) != None: | |
paper_url = row.find('a', href=True).attrs['href'] | |
else: | |
paper_url = "missing" | |
print("Year: " + str(year) + "; Authors: " + str(authors) + "; " + "Title: " + title + "; URL = " + str(paper_url) + "\n") | |
papers.append([str(year), str(authors), str(title), str(paper_url)]) | |
print("Finished preprocessing articles!") | |
print("Removing whitespaces from the name field...") | |
for x in range(0, len(papers)): | |
for r in (("\n", ""), ("\r", ""), (" ", "_"), (",",""), (".","")): | |
papers[x][1] = papers[x][1].replace(*r) | |
print("Done!") | |
print("Now let's remove some whitespaces from the titles") | |
for x in range(0, len(papers)): | |
for r in (("\n", ""), ("\r", ""), (" ", "_"), (",", ""), (".", "")): | |
papers[x][2] = papers[x][2].replace(*r) | |
print("We're all set!") | |
print('\nAn example of a paper from our list:\n', papers[0]) |
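
# The urllib.request and time imports above are otherwise unused, and the
# header promises the PDFs themselves (~250 MB). What follows is a minimal
# sketch of that download step, not part of the original listing: it assumes
# the hrefs on papers.html are either absolute URLs or paths relative to the
# site root, and it builds file names from the cleaned year/author/title fields.
BASE = "http://www.cs.toronto.edu/~hinton/"  # assumed base for relative links
for year, authors, title, paper_url in papers:
    if paper_url == "missing" or not paper_url.lower().endswith(".pdf"):
        continue  # skip rows without a usable PDF link
    full_url = paper_url if paper_url.startswith("http") else BASE + paper_url
    filename = year.strip() + "_" + authors + "_" + title + ".pdf"
    try:
        urllib.request.urlretrieve(full_url, filename)
        print("Downloaded " + filename)
    except Exception as exc:
        print("Could not download " + full_url + ": " + str(exc))
    time.sleep(1)  # pause between requests to be polite to the server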