Skip to content

Instantly share code, notes, and snippets.

@nanaze
Last active December 12, 2015 08:48
Show Gist options
  • Save nanaze/4746456 to your computer and use it in GitHub Desktop.
Save nanaze/4746456 to your computer and use it in GitHub Desktop.
Simple script to pull Stanford lecture videos locally.
import xml.dom
import os
import re
import urllib2
import logging
import urllib
import mechanize
import getpass
import html5lib
import string
import sys
import shutil
import tempfile
import os
# Characters allowed to survive in a generated file name.  Built once at
# import time; a frozenset gives constant-time membership tests.
_VALID_FILENAME_CHARS = frozenset(
    "-_.() " + string.ascii_letters + string.digits)


def make_filename(str):
    """Return ``str`` with every character outside a safe whitelist removed.

    The whitelist is ASCII letters, digits, space and the punctuation
    ``-_.()``.  Disallowed characters are dropped, not replaced, so the
    result may be shorter than the input (or empty).

    The parameter is named ``str`` (shadowing the builtin) to stay
    compatible with existing keyword callers; the builtin is not needed
    inside this function.
    """
    return ''.join(c for c in str if c in _VALID_FILENAME_CHARS)
def get_filename(url):
    """Extract the URL-decoded file name that precedes the query string.

    The name is the last path segment of the form ``<name>.<ext>`` where
    ``<ext>`` is exactly three lowercase letters/digits and is immediately
    followed by ``?``.

    Returns:
        The percent-decoded file name, or ``None`` when ``url`` contains no
        such segment (for example when it has no query string).
    """
    # ``unquote`` is documented in ``urllib`` on Python 2 and in
    # ``urllib.parse`` on Python 3; support both locations.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    match = re.search(r'/([^/]+\.[a-z0-9]{3})\?', url)
    if match is None:
        return None
    return unquote(match.group(1))
def getText(node):
    """Return the concatenated data of ``node``'s direct text-node children.

    Only immediate children are considered; text nested inside child
    elements is not included.  Returns an empty string when ``node`` has no
    text children.
    """
    # Use a distinct loop name so the ``node`` parameter is not shadowed.
    return ''.join(
        child.data
        for child in node.childNodes
        if child.nodeType == child.TEXT_NODE
    )
def yield_children(node):
    """Yield every descendant of ``node`` in document (pre-order) order.

    ``node`` itself is not yielded.  The traversal uses an explicit stack
    instead of recursion, so each node is produced once without being
    re-yielded through a chain of nested generators, and very deep trees do
    not hit the interpreter's recursion limit.
    """
    # Children are pushed in reverse so the first child is popped first.
    pending = list(node.childNodes)[::-1]
    while pending:
        current = pending.pop()
        yield current
        pending.extend(list(current.childNodes)[::-1])
def yield_child_elements(elem):
    """Yield the element-type descendants of ``elem``.

    Despite the name, this walks everything ``yield_children`` produces
    (all descendants, not only direct children) and skips any node whose
    type is not ``ELEMENT_NODE`` (text, comments, ...).
    """
    for descendant in yield_children(elem):
        if descendant.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        yield descendant
def yield_course_elems(doc):
    """Yield each element under ``doc`` whose ``class`` is 'course-list-item'.

    The comparison is an exact match on the whole ``class`` attribute value.
    """
    wanted_class = 'course-list-item'
    for candidate in yield_child_elements(doc):
        if candidate.getAttribute('class') != wanted_class:
            continue
        yield candidate
def get_child_with_class(node, classname):
    """Return the first element under ``node`` whose ``class`` equals ``classname``.

    Elements are examined in the order ``yield_child_elements`` produces
    them.  Returns ``None`` when no element matches.
    """
    matching = (
        element
        for element in yield_child_elements(node)
        if element.getAttribute('class') == classname
    )
    return next(matching, None)
def get_title(class_elem):
    """Return the text of the first ``<a>`` inside the 'pull-left' container.

    Returns ``None`` when the container holds no ``<a>`` element.  Raises
    ``AttributeError`` when ``class_elem`` has no 'pull-left' element at all,
    because the lookup then yields ``None``.
    """
    container = get_child_with_class(class_elem, 'pull-left')
    first_anchor = next(iter(container.getElementsByTagName('a')), None)
    if first_anchor is None:
        return None
    return getText(first_anchor)
def get_url(class_elem):
    """Return the large-format video link found in the 'pull-right' container.

    The first ``<a>`` whose ``href`` contains both 'amazonaws' and '/large/'
    wins.

    Returns:
        The matching ``href`` string, or ``None`` when there is no
        'pull-right' container or none of its links match.
    """
    right = get_child_with_class(class_elem, 'pull-right')
    if right is None:
        # No link container at all: report "not found" instead of raising
        # AttributeError, so the caller can log it and move on.
        return None
    for anchor in right.getElementsByTagName('a'):
        href = anchor.getAttribute('href')
        if 'amazonaws' in href and '/large/' in href:
            return href
    return None
def yield_videos(htmlstr):
    """Parse ``htmlstr`` and yield a ``(title, url)`` pair per course item.

    Items for which no download URL is found are skipped after logging a
    warning that names the item's title.
    """
    document = html5lib.parse(htmlstr, treebuilder='dom')
    for course_elem in yield_course_elems(document):
        title = get_title(course_elem)
        url = get_url(course_elem)
        if not url:
            logging.warning('Did not find URL for %s', title)
            continue
        yield title, url
def get_url_size(url):
    """Return the size in bytes the server reports for ``url``.

    The value is taken from the first ``Content-Length`` response header.

    Raises:
        IndexError: if the response carries no ``Content-Length`` header.
        ValueError: if the header value is not an integer.
    """
    import contextlib

    # Only the headers are needed; close the response so the connection is
    # released instead of lingering until garbage collection.
    with contextlib.closing(urllib.urlopen(url)) as site:
        meta = site.info()
        return int(meta.getheaders("Content-Length")[0])
def log_network_update(blocks, block_size, total_size):
    """Write a one-line, self-overwriting download progress report to stderr.

    Intended as a ``reporthook`` callback for ``urlretrieve``.

    Args:
        blocks: number of blocks transferred so far.
        block_size: size of one block in bytes.
        total_size: total size of the download in bytes; non-positive when
            the size is unknown.
    """
    byte_count = blocks * block_size
    if total_size > 0:
        # The final block is usually partial, so blocks * block_size can
        # overshoot; never report more bytes than the file contains.
        byte_count = min(byte_count, total_size)
    line = 'Blocks: %s Block size:%s Total bytes: %s/%s' % (
        blocks, block_size, byte_count, total_size)
    # Trailing backspaces move the cursor back to the start of the line so
    # the next report overwrites this one.
    sys.stderr.write(line + '\b' * len(line))
def fetch_file(url, save_path):
    """Download ``url`` to ``save_path`` via a temporary file.

    The data is first written to a temporary file and only moved to
    ``save_path`` once the transfer has finished, so an interrupted download
    never leaves a partial file at ``save_path``.  Progress is reported
    through ``log_network_update``.
    """
    logging.info('Fetching file %s, source %s', save_path, url)
    fd, temp_file = tempfile.mkstemp()
    # mkstemp returns an open descriptor; urlretrieve reopens the file by
    # name, so close ours right away to avoid leaking it.
    os.close(fd)
    try:
        urllib.urlretrieve(url, temp_file, reporthook=log_network_update)
        shutil.move(temp_file, save_path)
    finally:
        # After a successful move the temp file no longer exists; if
        # anything failed, do not leave it behind.
        if os.path.exists(temp_file):
            os.remove(temp_file)
def main():
    """Log in to the course site and download every listed lecture video.

    Prompts for credentials, submits the first form on the login page, then
    scrapes the videos page.  Each video is saved in the current directory
    under a sanitized "<title> <original file name>" name.  A video whose
    local copy is within 1 KiB of the server-reported size is skipped; a
    local copy of a different size is deleted and downloaded again.
    """
    logging.basicConfig(level=logging.DEBUG)
    logging.info('logging in...')
    br = mechanize.Browser()
    br.open("https://class.stanford.edu/accounts/login?next=/networking/Fall2012")
    br.select_form(nr=0)
    br["username"] = raw_input('Username: ')
    br["password"] = getpass.getpass()
    br.submit()

    logging.info('Scraping page for video links')
    br.open('http://class.stanford.edu/networking/Fall2012/videos')
    for title, url in yield_videos(br.response().get_data()):
        logging.info('Found title %s with URL %s', title, url)
        filename = get_filename(url)
        if title is None or filename is None:
            # Either lookup can come back empty; concatenating None below
            # would abort the whole run, so skip just this entry.
            logging.warning(
                'Could not build a file name for %s (title %s), skipping.',
                url, title)
            continue
        download_name = make_filename(title + ' ' + filename)

        # If the file exists and is roughly the same size, don't bother
        # downloading.
        if os.path.exists(download_name):
            local_size = os.path.getsize(download_name)
            server_size = get_url_size(url)
            if abs(local_size - server_size) < 1024:
                logging.info('File %s exists, skipping.', download_name)
                continue
            logging.info(
                'Local file seems different than server, deleting. %s',
                download_name)
            os.remove(download_name)

        fetch_file(url, download_name)
# Script entry point: start the interactive login and download run.
# NOTE(review): there is no ``if __name__ == "__main__"`` guard, so this
# also runs when the module is imported.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment