# nanaze/download stanford class videos

## download stanford class videos
import xml.dom
import os
import re
import urllib2
import logging
import urllib
import mechanize
import getpass
import html5lib
import string
import sys
import shutil
import tempfile
import os

def make_filename(str):
  """Return ``str`` with every character outside a filename-safe set removed.

  The characters kept are ASCII letters, digits, the space and ``-_.()``.
  Anything else is dropped, not replaced.  The parameter name shadows the
  ``str`` builtin inside this function; it is left as is so existing callers
  keep working.
  """
  valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  kept = [ch for ch in str if ch in valid_chars]
  return ''.join(kept)

def get_filename(url):
  """Extract the URL-decoded file name that precedes the query string.

  The name is the last path segment ending in a dot plus exactly three
  lowercase letters/digits, immediately followed by ``?``.  Returns None when
  ``url`` contains no such segment (for example, when it has no query string).
  """
  found = re.search(r'/([^/]+\.[a-z0-9]{3})\?', url)
  if found is None:
    return None
  return urllib2.unquote(found.group(1))

def getText(node):
  """Concatenate the text held directly by ``node``.

  Only immediate children whose node type is TEXT_NODE contribute; text
  nested inside child elements is ignored.  Returns '' when there is none.
  """
  pieces = [child.data
            for child in node.childNodes
            if child.nodeType == child.TEXT_NODE]
  return ''.join(pieces)

def yield_children(node):
  """Lazily yield every descendant of ``node`` in document (pre-)order.

  Each child is yielded before its own descendants; ``node`` itself is not
  yielded.
  """
  # Stack of child iterators, innermost last; stands in for the call stack
  # of the equivalent recursive walk.
  pending = [iter(node.childNodes)]
  while pending:
    for child in pending[-1]:
      yield child
      # Descend into this child before continuing with its siblings.
      pending.append(iter(child.childNodes))
      break
    else:
      # Innermost iterator is exhausted: go back up one level.
      pending.pop()

def yield_child_elements(elem):
  """Yield the descendants of ``elem`` that are element nodes.

  Walks the descendants with ``yield_children`` and skips every node whose
  type is not ELEMENT_NODE (text, comments, ...).
  """
  element_type = xml.dom.Node.ELEMENT_NODE
  for descendant in yield_children(elem):
    if descendant.nodeType != element_type:
      continue
    yield descendant
def yield_course_elems(doc):
  """Yield descendant elements of ``doc`` that represent one course item.

  An element qualifies when its ``class`` attribute is exactly
  'course-list-item' (elements carrying additional classes do not match).
  """
  for candidate in yield_child_elements(doc):
    if candidate.getAttribute('class') != 'course-list-item':
      continue
    yield candidate

def get_child_with_class(node, classname):
  """Return the first descendant element whose class equals ``classname``.

  The comparison is against the whole ``class`` attribute value.  Returns
  None when no descendant element matches.
  """
  matching = (element
              for element in yield_child_elements(node)
              if element.getAttribute('class') == classname)
  return next(matching, None)

def get_title(class_elem):
  """Return the text of the first link inside the 'pull-left' child.

  Returns None when ``class_elem`` has no descendant with class 'pull-left'
  or when that descendant contains no ``<a>`` element.
  """
  left = get_child_with_class(class_elem, 'pull-left')
  if left is None:
    # get_child_with_class found nothing; avoid calling a method on None.
    return None
  for a in left.getElementsByTagName('a'):
    # Only the first anchor is used as the title.
    return getText(a)
  return None

def get_url(class_elem):
  """Return the download link found inside the 'pull-right' child.

  The link chosen is the first ``<a>`` whose href contains both 'amazonaws'
  and '/large/'.  Returns None when there is no 'pull-right' descendant or
  no such link, which the caller treats as "no URL found".
  """
  right = get_child_with_class(class_elem, 'pull-right')
  if right is None:
    # get_child_with_class found nothing; avoid calling a method on None.
    return None
  for a in right.getElementsByTagName('a'):
    url = a.getAttribute('href')
    if 'amazonaws' in url and '/large/' in url:
      return url
  return None


def yield_videos(htmlstr):
  """Parse ``htmlstr`` and yield a ``(title, url)`` pair per course item.

  Items for which no URL is found are not yielded; a warning naming the
  title is logged instead.
  """
  document = html5lib.parse(htmlstr, treebuilder='dom')

  for item in yield_course_elems(document):
    name = get_title(item)
    link = get_url(item)

    if not link:
      logging.warning('Did not find URL for %s', name)
      continue
    yield name, link

def get_url_size(url):
  """Return the size in bytes the server reports for ``url``.

  Opens ``url`` and converts the first Content-Length response header to an
  int.  The response object is always closed before returning.

  Raises:
    IndexError: the response carries no Content-Length header.
    ValueError: the header value is not an integer.
  """
  site = urllib.urlopen(url)
  try:
    meta = site.info()
    return int(meta.getheaders("Content-Length")[0])
  finally:
    # Only the headers are needed; release the connection right away.
    site.close()


def log_network_update(blocks, block_size, total_size):
  """Write a one-line download progress report to stderr.

  Has the (block count, block size, total size) signature used for the
  ``reporthook`` of ``urllib.urlretrieve``.

  Args:
    blocks: number of blocks transferred so far.
    block_size: size of one block in bytes.
    total_size: total size in bytes; a non-positive value means unknown.
  """
  byte_count = blocks * block_size
  if total_size > 0:
    # The last block is usually partial, so blocks * block_size can exceed
    # the real size; never report more than the total.
    byte_count = min(byte_count, total_size)
  line = 'Blocks: %s Block size:%s Total bytes: %s/%s' % (blocks, block_size, byte_count, total_size)
  sys.stderr.write(line)
  # Backspace over the line so the next report overwrites it in place.
  sys.stderr.write('\b' * len(line))

def fetch_file(url, save_path):
  """Download ``url`` and store it at ``save_path``.

  The data is first written to a temporary file and only moved to
  ``save_path`` once the transfer has finished, so an interrupted download
  does not leave a partial file under the final name.
  """
  logging.info('Fetching file %s, source %s', save_path, url)
  fd, temp_file = tempfile.mkstemp()
  # mkstemp hands back an open descriptor; urlretrieve reopens the path
  # itself, so close ours to avoid leaking one descriptor per download.
  os.close(fd)
  try:
    urllib.urlretrieve(url, temp_file, reporthook=log_network_update)
    shutil.move(temp_file, save_path)
  finally:
    # After a successful move the temp file is gone; on failure remove the
    # leftover so aborted downloads do not pile up in the temp directory.
    if os.path.exists(temp_file):
      os.remove(temp_file)


def main():
  """Log in to class.stanford.edu and download every listed course video.

  Prompts for username and password, submits the first form on the login
  page, scrapes the videos page, and fetches each video into the current
  directory.  A video whose local copy already exists and is within 1 KiB
  of the size reported by the server is skipped.
  """
  logging.basicConfig(level=logging.DEBUG)

  logging.info('logging in...')
  br = mechanize.Browser()
  br.open("https://class.stanford.edu/accounts/login?next=/networking/Fall2012")
  br.select_form(nr=0)
  br["username"] = raw_input('Username: ')
  br["password"] = getpass.getpass()

  br.submit()

  logging.info('Scrapeing page for image links')
  br.open('http://class.stanford.edu/networking/Fall2012/videos')

  for title, url in yield_videos(br.response().get_data()):

    logging.info('Found title %s with URL %s', title, url)

    filename = get_filename(url)
    if title is None or filename is None:
      # Either helper may return None; concatenating it below would raise
      # TypeError and abort the remaining downloads.
      logging.warning('Could not build a file name for %s, skipping.', url)
      continue
    download_name = make_filename(title + ' ' + filename)

    # If the file exists and is roughly the same size, don't bother downloading.
    if os.path.exists(download_name):
      local_size = os.path.getsize(download_name)
      server_size = get_url_size(url)

      if abs(local_size - server_size) < 1024:
        logging.info('File %s exists, skipping.', download_name)
        continue
      else:
        logging.info('Local file seems different than server, deleting. %s', download_name)
        os.remove(download_name)

    fetch_file(url, download_name)


# Script entry point: start the download run as soon as this file executes.
# NOTE(review): there is no __main__ guard, so importing this module also
# triggers the login prompt and downloads.
main()
	import xml.dom
	import os
	import re
	import urllib2
	import logging
	import urllib
	import mechanize
	import getpass
	import html5lib
	import string
	import sys
	import shutil
	import tempfile
	import os

	def make_filename(str):
	  """Return ``str`` with every character outside a filename-safe set removed.

	  The characters kept are ASCII letters, digits, the space and ``-_.()``.
	  Anything else is dropped, not replaced.  The parameter name shadows the
	  ``str`` builtin inside this function; it is left as is so existing
	  callers keep working.
	  """
	  valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
	  return ''.join(c for c in str if c in valid_chars)

	def get_filename(url):
	  """Extract the URL-decoded file name that precedes the query string.

	  The name is the last path segment ending in a dot plus exactly three
	  lowercase letters/digits, immediately followed by ``?``.  Returns None
	  when ``url`` contains no such segment.
	  """
	  regex_str = r'/([^/]+\.[a-z0-9]{3})\?'
	  match = re.search(regex_str, url)
	  if match:
	    filename = match.group(1)
	    filename = urllib2.unquote(filename)
	    return filename
	  return None

	def getText(node):
	  """Concatenate the text held directly by ``node``.

	  Only immediate children whose node type is TEXT_NODE contribute; text
	  nested inside child elements is ignored.  Returns '' when there is none.
	  """
	  rc = []
	  for child in node.childNodes:
	    if child.nodeType == child.TEXT_NODE:
	      rc.append(child.data)
	  return ''.join(rc)

	def yield_children(node):
	  """Lazily yield every descendant of ``node`` in document (pre-)order.

	  Each child is yielded before its own descendants; ``node`` itself is
	  not yielded.
	  """
	  for child in node.childNodes:
	    yield child
	    # Recurse so grandchildren follow their parent immediately.
	    for n in yield_children(child):
	      yield n

	def yield_child_elements(elem):
	  """Yield the descendants of ``elem`` that are element nodes.

	  Walks the descendants with ``yield_children`` and skips every node
	  whose type is not ELEMENT_NODE (text, comments, ...).
	  """
	  for child in yield_children(elem):
	    if child.nodeType == xml.dom.Node.ELEMENT_NODE:
	      yield child

	def yield_course_elems(doc):
	  """Yield descendant elements of ``doc`` that represent one course item.

	  An element qualifies when its ``class`` attribute is exactly
	  'course-list-item' (elements carrying additional classes do not match).
	  """
	  for elem in yield_child_elements(doc):
	    if elem.getAttribute('class') == 'course-list-item':
	      yield elem

	def get_child_with_class(node, classname):
	  """Return the first descendant element whose class equals ``classname``.

	  The comparison is against the whole ``class`` attribute value.  Returns
	  None when no descendant element matches.
	  """
	  for child in yield_child_elements(node):
	    if child.getAttribute('class') == classname:
	      return child
	  return None

	def get_title(class_elem):
	  """Return the text of the first link inside the 'pull-left' child.

	  Returns None when ``class_elem`` has no descendant with class
	  'pull-left' or when that descendant contains no ``<a>`` element.
	  """
	  left = get_child_with_class(class_elem, 'pull-left')
	  if left is None:
	    # get_child_with_class found nothing; avoid calling a method on None.
	    return None
	  for a in left.getElementsByTagName('a'):
	    # Only the first anchor is used as the title.
	    return getText(a)
	  return None

	def get_url(class_elem):
	  """Return the download link found inside the 'pull-right' child.

	  The link chosen is the first ``<a>`` whose href contains both
	  'amazonaws' and '/large/'.  Returns None when there is no 'pull-right'
	  descendant or no such link, which the caller treats as "no URL found".
	  """
	  right = get_child_with_class(class_elem, 'pull-right')
	  if right is None:
	    # get_child_with_class found nothing; avoid calling a method on None.
	    return None
	  for a in right.getElementsByTagName('a'):
	    url = a.getAttribute('href')
	    if 'amazonaws' in url and '/large/' in url:
	      return url
	  return None


	def yield_videos(htmlstr):
	  """Parse ``htmlstr`` and yield a ``(title, url)`` pair per course item.

	  Items for which no URL is found are not yielded; a warning naming the
	  title is logged instead.
	  """
	  doc = html5lib.parse(htmlstr, treebuilder='dom')

	  for video_elem in yield_course_elems(doc):
	    title = get_title(video_elem)
	    url = get_url(video_elem)

	    if url:
	      yield title, url
	    else:
	      logging.warning('Did not find URL for %s', title)

	def get_url_size(url):
	  """Return the size in bytes the server reports for ``url``.

	  Opens ``url`` and converts the first Content-Length response header to
	  an int.  The response object is always closed before returning.

	  Raises:
	    IndexError: the response carries no Content-Length header.
	    ValueError: the header value is not an integer.
	  """
	  site = urllib.urlopen(url)
	  try:
	    meta = site.info()
	    return int(meta.getheaders("Content-Length")[0])
	  finally:
	    # Only the headers are needed; release the connection right away.
	    site.close()


	def log_network_update(blocks, block_size, total_size):
	  """Write a one-line download progress report to stderr.

	  Has the (block count, block size, total size) signature used for the
	  ``reporthook`` of ``urllib.urlretrieve``.

	  Args:
	    blocks: number of blocks transferred so far.
	    block_size: size of one block in bytes.
	    total_size: total size in bytes; a non-positive value means unknown.
	  """
	  byte_count = blocks * block_size
	  if total_size > 0:
	    # The last block is usually partial, so blocks * block_size can
	    # exceed the real size; never report more than the total.
	    byte_count = min(byte_count, total_size)
	  line = 'Blocks: %s Block size:%s Total bytes: %s/%s' % (blocks, block_size, byte_count, total_size)
	  sys.stderr.write(line)
	  # Backspace over the line so the next report overwrites it in place.
	  sys.stderr.write('\b' * len(line))

	def fetch_file(url, save_path):
	  """Download ``url`` and store it at ``save_path``.

	  The data is first written to a temporary file and only moved to
	  ``save_path`` once the transfer has finished, so an interrupted
	  download does not leave a partial file under the final name.
	  """
	  logging.info('Fetching file %s, source %s', save_path, url)
	  fd, temp_file = tempfile.mkstemp()
	  # mkstemp hands back an open descriptor; urlretrieve reopens the path
	  # itself, so close ours to avoid leaking one descriptor per download.
	  os.close(fd)
	  try:
	    urllib.urlretrieve(url, temp_file, reporthook=log_network_update)
	    shutil.move(temp_file, save_path)
	  finally:
	    # After a successful move the temp file is gone; on failure remove
	    # the leftover so aborted downloads do not pile up.
	    if os.path.exists(temp_file):
	      os.remove(temp_file)



	def main():
	  """Log in to class.stanford.edu and download every listed course video.

	  Prompts for username and password, submits the first form on the login
	  page, scrapes the videos page, and fetches each video into the current
	  directory.  A video whose local copy already exists and is within 1 KiB
	  of the size reported by the server is skipped.
	  """
	  logging.basicConfig(level=logging.DEBUG)

	  logging.info('logging in...')
	  br = mechanize.Browser()
	  br.open("https://class.stanford.edu/accounts/login?next=/networking/Fall2012")
	  br.select_form(nr=0)
	  br["username"] = raw_input('Username: ')
	  br["password"] = getpass.getpass()

	  br.submit()

	  logging.info('Scrapeing page for image links')
	  br.open('http://class.stanford.edu/networking/Fall2012/videos')

	  for title, url in yield_videos(br.response().get_data()):

	    logging.info('Found title %s with URL %s', title, url)

	    filename = get_filename(url)
	    if title is None or filename is None:
	      # Either helper may return None; concatenating it below would raise
	      # TypeError and abort the remaining downloads.
	      logging.warning('Could not build a file name for %s, skipping.', url)
	      continue
	    download_name = make_filename(title + ' ' + filename)

	    # If the file exists and is roughly the same size, don't bother downloading.
	    if os.path.exists(download_name):
	      local_size = os.path.getsize(download_name)
	      server_size = get_url_size(url)

	      if abs(local_size - server_size) < 1024:
	        logging.info('File %s exists, skipping.', download_name)
	        continue
	      else:
	        logging.info('Local file seems different than server, deleting. %s', download_name)
	        os.remove(download_name)

	    fetch_file(url, download_name)


	# Script entry point: start the download run as soon as this file executes.
	# NOTE(review): there is no __main__ guard, so importing this module also
	# triggers the login prompt and downloads.
	main()