Last active
December 12, 2015 08:48
-
-
Save nanaze/4746456 to your computer and use it in GitHub Desktop.
Simple script to pull Stanford lecture videos to the local machine.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.dom | |
import os | |
import re | |
import urllib2 | |
import logging | |
import urllib | |
import mechanize | |
import getpass | |
import html5lib | |
import string | |
import sys | |
import shutil | |
import tempfile | |
import os | |
def make_filename(str):
    """Return *str* with every character outside the whitelist removed.

    The whitelist is ASCII letters, digits, the space character and the
    punctuation ``-_.()``. Characters outside it are dropped, not replaced.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept as-is
    # so that keyword callers keep working.
    allowed = "-_.() %s%s" % (string.ascii_letters, string.digits)
    kept = [ch for ch in str if ch in allowed]
    return ''.join(kept)
def get_filename(url):
    """Extract the percent-decoded file name from the path of *url*.

    The last path segment is accepted only if it ends in a dot followed by
    exactly three lowercase alphanumeric characters (e.g. ``.mp4``).

    Args:
        url: URL string; a query string or fragment may be present but is
            not required.

    Returns:
        The decoded file name, or ``None`` when the last path segment does
        not look like a file name.
    """
    # ``unquote`` is documented in urllib.parse (Python 3) / urllib
    # (Python 2); importing it here keeps the helper usable on both.
    try:
        from urllib.parse import unquote, urlsplit
    except ImportError:
        from urllib import unquote
        from urlparse import urlsplit

    # Looking only at the path means the host name or a value inside the
    # query string can never be mistaken for the file name.
    basename = urlsplit(url).path.rpartition('/')[2]
    if re.match(r'[^/]+\.[a-z0-9]{3}\Z', basename):
        return unquote(basename)
    return None
def getText(node):
    """Concatenate the data of *node*'s direct text-node children.

    Only immediate children are inspected; text nested inside child
    elements is not included.
    """
    pieces = [
        child.data
        for child in node.childNodes
        if child.nodeType == child.TEXT_NODE
    ]
    return ''.join(pieces)
def yield_children(node):
    """Yield every descendant of *node* in document (pre-order) sequence.

    *node* itself is not yielded.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that the first child is popped (and therefore yielded) first.
    pending = list(reversed(node.childNodes))
    while pending:
        current = pending.pop()
        yield current
        pending.extend(reversed(current.childNodes))
def yield_child_elements(elem):
    """Yield the element nodes among everything ``yield_children(elem)`` produces.

    Nodes of any other type (text, comments, ...) are skipped; order is
    preserved.
    """
    element_type = xml.dom.Node.ELEMENT_NODE
    for descendant in yield_children(elem):
        if descendant.nodeType != element_type:
            continue
        yield descendant
def yield_course_elems(doc):
    """Yield each element under *doc* whose ``class`` attribute is exactly
    ``course-list-item``.
    """
    wanted = 'course-list-item'
    for candidate in yield_child_elements(doc):
        if candidate.getAttribute('class') != wanted:
            continue
        yield candidate
def get_child_with_class(node, classname):
    """Return the first element under *node* whose ``class`` attribute equals
    *classname* exactly, or ``None`` if there is no such element.
    """
    matches = (
        candidate
        for candidate in yield_child_elements(node)
        if candidate.getAttribute('class') == classname
    )
    return next(matches, None)
def get_title(class_elem):
    """Return the text of the first ``<a>`` inside the ``pull-left`` child.

    Args:
        class_elem: DOM element to search.

    Returns:
        The anchor's direct text, or ``None`` when *class_elem* has no
        ``pull-left`` descendant or that descendant contains no anchor.
    """
    left = get_child_with_class(class_elem, 'pull-left')
    if left is None:
        # Without this guard a missing container raised AttributeError; a
        # missing title is reported the same way as a missing anchor.
        return None
    for a in left.getElementsByTagName('a'):
        return getText(a)
    return None
def get_url(class_elem):
    """Return the first qualifying download link inside the ``pull-right`` child.

    A link qualifies when its ``href`` contains both ``amazonaws`` and
    ``/large/``.

    Args:
        class_elem: DOM element to search.

    Returns:
        The ``href`` string, or ``None`` when *class_elem* has no
        ``pull-right`` descendant or none of its anchors qualifies.
    """
    right = get_child_with_class(class_elem, 'pull-right')
    if right is None:
        # Without this guard a missing container raised AttributeError;
        # returning None lets the caller handle it as "no URL found".
        return None
    for a in right.getElementsByTagName('a'):
        url = a.getAttribute('href')
        if 'amazonaws' in url and '/large/' in url:
            return url
    return None
def yield_videos(htmlstr):
    """Parse *htmlstr* and yield a ``(title, url)`` pair per course item.

    Items for which ``get_url`` finds no link are not yielded; a warning
    naming the item's title is logged instead.
    """
    document = html5lib.parse(htmlstr, treebuilder='dom')
    for course_elem in yield_course_elems(document):
        title = get_title(course_elem)
        url = get_url(course_elem)
        if not url:
            logging.warning('Did not find URL for %s', title)
            continue
        yield title, url
def get_url_size(url):
    """Return the size in bytes the server reports for *url*.

    Opens *url* with ``urllib.urlopen`` and reads the first
    ``Content-Length`` response header; the response body is not read.

    Raises:
        IndexError: if the response carries no ``Content-Length`` header.
        ValueError: if the header value is not an integer.
    """
    site = urllib.urlopen(url)
    try:
        lengths = site.info().getheaders("Content-Length")
    finally:
        # Only the headers are needed, so close the response right away
        # instead of leaving the connection open until garbage collection.
        site.close()
    return int(lengths[0])
def log_network_update(blocks, block_size, total_size):
    """Write a one-line transfer progress report to stderr.

    The signature matches what ``urllib.urlretrieve`` passes to its
    ``reporthook``. After the line is written, one backspace per character
    is emitted so that the next report overwrites it.
    """
    transferred = blocks * block_size
    report = 'Blocks: %s Block size:%s Total bytes: %s/%s' % (blocks, block_size, transferred, total_size)
    rewind = '\b' * len(report)
    sys.stderr.write(report + rewind)
def fetch_file(url, save_path):
    """Download *url* to *save_path* via a temporary file.

    The data is first retrieved into a temporary file and only moved to
    *save_path* once the retrieval has finished, so an interrupted download
    does not leave a partial file at *save_path*.

    Args:
        url: Source URL.
        save_path: Destination path on the local filesystem.
    """
    logging.info('Fetching file %s, source %s', save_path, url)
    fd, temp_file = tempfile.mkstemp()
    # mkstemp hands back an already-open OS-level descriptor. urlretrieve is
    # given the path, not the descriptor, so close ours to avoid leaking one
    # descriptor per downloaded file.
    os.close(fd)
    try:
        urllib.urlretrieve(url, temp_file, reporthook=log_network_update)
        shutil.move(temp_file, save_path)
    finally:
        # After a successful move the temp path no longer exists; if anything
        # failed before that, remove the leftover temporary file.
        if os.path.exists(temp_file):
            os.remove(temp_file)
def main():
    """Log in to class.stanford.edu and download every listed lecture video.

    Prompts for a username and password, submits the first form on the login
    page, scrapes the course's videos page, and fetches each video into the
    current working directory. A video is skipped when a local file of the
    same name already exists and its size is within 1 KiB of the size
    reported by the server.
    """
    logging.basicConfig(level=logging.DEBUG)
    logging.info('logging in...')
    br = mechanize.Browser()
    br.open("https://class.stanford.edu/accounts/login?next=/networking/Fall2012")
    br.select_form(nr=0)
    br["username"] = raw_input('Username: ')
    br["password"] = getpass.getpass()
    br.submit()
    logging.info('Scraping page for video links')
    # NOTE(review): the login above uses https but this page is requested over
    # plain http — confirm whether the https URL should be used here as well.
    br.open('http://class.stanford.edu/networking/Fall2012/videos')
    for title, url in yield_videos(br.response().get_data()):
        logging.info('Found title %s with URL %s', title, url)
        filename = get_filename(url)
        if title is None or filename is None:
            # Either helper can return None; concatenating None below would
            # abort the whole run with a TypeError, so skip just this entry.
            logging.warning('Could not build a file name for %s, skipping.', url)
            continue
        download_name = make_filename(title + ' ' + filename)
        # If the file exists and is roughly the same size, don't bother downloading.
        if os.path.exists(download_name):
            local_size = os.path.getsize(download_name)
            server_size = get_url_size(url)
            if abs(local_size - server_size) < 1024:
                logging.info('File %s exists, skipping.', download_name)
                continue
            logging.info('Local file seems different than server, deleting. %s', download_name)
            os.remove(download_name)
        fetch_file(url, download_name)
# Script entry point: start the login-and-download run.
# NOTE(review): the call is unguarded, so importing this module also triggers
# the interactive login and the downloads — consider wrapping it in
# `if __name__ == "__main__":` if the helpers are ever imported elsewhere.
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment