Skip to content

Instantly share code, notes, and snippets.

@zhangqiaorjc
Created February 3, 2014 00:57
Show Gist options
  • Save zhangqiaorjc/8777429 to your computer and use it in GitHub Desktop.
Save zhangqiaorjc/8777429 to your computer and use it in GitHub Desktop.
Download all annotated lectures slides from UW CSE 546 website
import urllib2
import re
import threading
class crawler_thread(threading.Thread):
def __init__(self, url, filepath):
threading.Thread.__init__(self)
self.url = url
self.filepath = filepath
def run(self):
fid = open(self.filepath, 'w')
fid.write(urllib2.urlopen(self.url).read())
fid.close()
# download html containing file links
download_directory = '/Users/qiao/Dropbox/cse546/slides/'
source_html_url = 'http://courses.cs.washington.edu/courses/cse546/13au/schedule.html'
# get html
source_html = urllib2.urlopen(source_html_url).read()
# find all filenames
file_url_pattern = re.compile('slides/.*annotated.pdf')
list_filenames = file_url_pattern.findall(source_html)
# compile full url for each filenames
threads = []
base_url = 'http://courses.cs.washington.edu/courses/cse546/13au/'
for filename in list_filenames:
url = base_url + filename
filename_on_disk = download_directory + filename.split('/')[1]
t = crawler_thread(url, filename_on_disk)
t.start()
threads.append(t)
for t in threads:
t.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment