@cfilipov
Created November 6, 2017 04:11

from bs4 import BeautifulSoup
import urllib2

def crawl(url):
    # Download one index page and save every manuscript PDF it links to.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    # The list of manuscripts is the fourth table on the page; skip the header row.
    rows = soup.findAll("table")[3].findAll("tr")
    for row in rows[1:]:
        cols = row.findAll("td")
        ahref = cols[2].findAll("a")[0]
        ewdnum = cols[0].b.string
        # Split the EWD number into its digits and any trailing letters
        # (e.g. "28a"), then zero-pad the digits so filenames sort correctly.
        ewdnumpart = "".join([c for c in ewdnum if c.isdigit()])
        ewdcharpart = "".join([c for c in ewdnum if c.isalpha()])
        ewdpadded = ewdnumpart.zfill(4)
        if ewdnumpart == ewdnum:
            ewdnum = ewdpadded
        else:
            ewdnum = ewdpadded + ewdcharpart
        ewdname = "EWD" + ewdnum + " " + ahref.contents[0]
        # Keep only filesystem-safe characters in the filename.
        filename = "".join([c for c in ewdname if c.isalnum() or c in " ()-"])
        filename = filename + ".pdf"
        pdfUrl = baseUrl + ahref["href"]
        pdfFile = urllib2.urlopen(pdfUrl)
        print "Saving: " + filename
        output = open(filename, "wb")
        output.write(pdfFile.read())
        output.close()

baseUrl = "http://www.cs.utexas.edu/~EWD/"
startPage = "index00xx.html"

# The start page links to the per-range index pages (index00xx.html,
# index01xx.html, ...); crawl each of them in turn.
html = urllib2.urlopen(baseUrl + startPage).read()
soup = BeautifulSoup(html, "html.parser")
links = soup.findAll("table")[1].findAll("tr")[1].findAll("p")[2].findAll(href=True)
for link in links:
    crawl(baseUrl + link["href"])
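
The script above targets Python 2 (urllib2 and the print statement). Under Python 3 the same fetch-and-save step would go through urllib.request instead; here is a minimal sketch, assuming the EWD index pages keep the table layout used above. The save_pdf helper name is illustrative, not from the original gist.

# Python 3 sketch of the fetch-and-save step (untested against the live site;
# assumes the same page structure as the Python 2 script above).
from urllib.request import urlopen
from bs4 import BeautifulSoup

baseUrl = "http://www.cs.utexas.edu/~EWD/"

html = urlopen(baseUrl + "index00xx.html").read()
soup = BeautifulSoup(html, "html.parser")

def save_pdf(href, filename):
    # Stream one PDF to disk; urlopen responses are context managers in Python 3.
    with urlopen(baseUrl + href) as pdf, open(filename, "wb") as out:
        out.write(pdf.read())
    print("Saving: " + filename)

The rest of the crawl logic carries over unchanged apart from print() becoming a function call.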