@cfilipov
Created November 6, 2017 04:11

from bs4 import BeautifulSoup
import urllib2

def crawl(url):
    # Download one index page and save every manuscript PDF it links to.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    # The list of manuscripts is the fourth table on the page; skip the header row.
    rows = soup.findAll("table")[3].findAll("tr")
    for row in rows[1:]:
        cols = row.findAll("td")
        ahref = cols[2].findAll("a")[0]
        ewdnum = cols[0].b.string
        # Split the EWD number into its digits and any trailing letters
        # (e.g. "28a"), then zero-pad the digits so filenames sort correctly.
        ewdnumpart = "".join([c for c in ewdnum if c.isdigit()])
        ewdcharpart = "".join([c for c in ewdnum if c.isalpha()])
        ewdpadded = ewdnumpart.zfill(4)
        if ewdnumpart == ewdnum:
            ewdnum = ewdpadded
        else:
            ewdnum = ewdpadded + ewdcharpart
        ewdname = "EWD" + ewdnum + " " + ahref.contents[0]
        # Keep only filesystem-safe characters in the filename.
        filename = "".join([c for c in ewdname if c.isalnum() or c in " ()-"])
        filename = filename + ".pdf"
        pdfUrl = baseUrl + ahref["href"]
        pdfFile = urllib2.urlopen(pdfUrl)
        print "Saving: " + filename
        output = open(filename, "wb")
        output.write(pdfFile.read())
        output.close()

baseUrl = "http://www.cs.utexas.edu/~EWD/"
startPage = "index00xx.html"

# The start page links to the per-range index pages (index00xx.html,
# index01xx.html, ...); crawl each of them in turn.
html = urllib2.urlopen(baseUrl + startPage).read()
soup = BeautifulSoup(html, "html.parser")
links = soup.findAll("table")[1].findAll("tr")[1].findAll("p")[2].findAll(href=True)
for link in links:
    crawl(baseUrl + link["href"])
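
The script above targets Python 2 (urllib2 and the print statement). Under Python 3 the same fetch-and-save step would go through urllib.request instead; here is a minimal sketch, assuming the EWD index pages keep the table layout used above. The save_pdf helper name is illustrative, not from the original gist.

# Python 3 sketch of the fetch-and-save step (untested against the live site;
# assumes the same page structure as the Python 2 script above).
from urllib.request import urlopen
from bs4 import BeautifulSoup

baseUrl = "http://www.cs.utexas.edu/~EWD/"

html = urlopen(baseUrl + "index00xx.html").read()
soup = BeautifulSoup(html, "html.parser")

def save_pdf(href, filename):
    # Stream one PDF to disk; urlopen responses are context managers in Python 3.
    with urlopen(baseUrl + href) as pdf, open(filename, "wb") as out:
        out.write(pdf.read())
    print("Saving: " + filename)

The rest of the crawl logic carries over unchanged apart from print() becoming a function call.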