@npiv
Created March 13, 2012 06:59
Coursera course downloader
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Coursera_downloader
# Purpose:     Downloader for the Coursera Design and Analysis of Algorithms
#              lectures. Requires Python 2.7.
#
# Author:      Alder
#
# Created:     11/03/2012
# Copyright:   (c) Alder 2012
# Licence:     free
#-------------------------------------------------------------------------------
# Set these values before running!
cookie_content = 'Copy here'
"""Set this to the content of the "session" cookie from your browser.
Since Coursera courses are available only to logged-in users, the script needs a
way to sneak into the logged-in area. Coursera stores the login info in the
"session" cookie, so:
1) Log in to Coursera in your browser.
2) Find the "session" cookie and check its value.
3) Copy that value and paste it into the cookie_content variable above.
In Firefox you can get it via Tools -> Options -> Privacy:
select "Use custom settings", press the "Show Cookies..." button,
find the Coursera cookies there, and copy the "Content" value of the "session" cookie.
It will be a very long string of characters, for example:
cookie_content = 'b65ee63c972b8a2eaa3---skipped--8a2'
"""
starting_chapter = 0
"""
Set this in case you want to start downloading from somewhere other than the
very beginning. It is a 0-based chapter index.
"""
target_path = "F:\\Video\\_Lectures\\DesignAndAnalysis"
# That one should be pretty straightforward :) Yes, it's an MS Windows path.
import urllib2
import cookielib
import errno
import os
def is_valid(x):
    if x.isalnum():
        return True
    if "()_.- ".find(x) != -1:
        return True
    return False
def create_filesystem_name(input_string):
    """Process input_string so that it becomes a valid directory/file name."""
    return filter(is_valid, input_string)
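# For example (illustrative): create_filesystem_name('1. Intro: Why Study Algorithms? (10 min)')
# returns '1. Intro Why Study Algorithms (10 min)' -- the colon and question mark
# are dropped because is_valid() only keeps alphanumerics and "()_.- ".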
def create_dir(chapter_name):
    # Create the directory; an already-existing directory is not an error.
    try:
        os.makedirs(chapter_name)
    except os.error as exc:
        if exc.errno != errno.EEXIST:
            raise
    return
def create_and_download_lecture(lecture_data, lecture_idx, chapter_path):
    lecture_path = chapter_path + '\\' + create_filesystem_name('%02d-%s' % (lecture_idx, lecture_data[0]))
    create_dir(lecture_path)
    os.chdir(lecture_path)
    for url in lecture_data[1:]:
        connection = urllib2.urlopen(url)
        meta = connection.info().getheaders("Content-Length")
        size = "unknown"
        if len(meta) > 0:
            size = meta[0]
        meta = connection.info().getheaders("Content-Disposition")
        if len(meta) > 0:
            # Prefer the file name suggested by the Content-Disposition header...
            file_name = create_filesystem_name(meta[0].split('filename=')[-1])
        else:
            # ...otherwise fall back to the last component of the final URL.
            file_name = create_filesystem_name(connection.geturl().split('/')[-1])
        print "downloading " + url + " size:" + size
        data = connection.read()
        f = open(file_name, 'wb')
        f.write(data)
        f.close()
    return
def get_chapter_name(html_links_data, start_pos):
    str = '<h3 class="list_header">'
    pos = html_links_data.find(str, start_pos)
    if pos == -1:
        return [-1, '']
    str2 = '</h3>'
    pos2 = html_links_data.find(str2, pos)
    return [pos2 + len(str2), html_links_data[pos + len(str):pos2]]
def get_lecture_attachment(html_links_data, start_pos):
    str_href = 'href="'
    pos = html_links_data.find(str_href, start_pos)
    if pos == -1:
        return [-1, '']
    pos += len(str_href)
    pos2 = html_links_data.find('"', pos)
    attachment = html_links_data[pos:pos2]
    return [pos2, attachment]
def get_lecture_data(html_links_data, start_pos, break_pos):
    pos = html_links_data.find('<a', start_pos)
    if pos == -1 or pos > break_pos:
        # No more lectures, or the next <a> already belongs to the next chapter.
        return [-1, '']
    pos = html_links_data.find('>', pos)
    pos2 = html_links_data.find('</a>', pos)
    lecture_name = html_links_data[pos + 1:pos2]
    lecture_end_pos = html_links_data.find('</li>', pos2)
    pos = pos_try = pos2
    result = [lecture_name]
    while True:
        [pos_try, attachment] = get_lecture_attachment(html_links_data, pos)
        if (pos_try == -1) or pos_try > lecture_end_pos:
            # No more attachments, or the attachment belongs to the next lecture.
            break
        result = result + [attachment]
        pos = pos_try
    return [pos, result]
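# Illustrative sketch of the lecture-list markup the parsing helpers above
# assume (reconstructed from the string searches; the real page may differ):
#   <div class="item_list">
#     <h3 class="list_header">Chapter title</h3>
#     <ul>
#       <li><a ...>Lecture title</a>
#           <a href=".../lecture.mp4">...</a> <a href=".../slides.pdf">...</a></li>
#       ...
#     </ul>
#     <h3 class="list_header">Next chapter title</h3>
#     ...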
def parse_and_download(html_data):
    html_links_data = html_data.partition('<div class="item_list">')[2]
    pos = 0
    chapter_idx = -1
    while True:
        [pos, chapter_name] = get_chapter_name(html_links_data, pos)
        if pos == -1:  # no more chapters
            break
        chapter_idx += 1
        if chapter_idx < starting_chapter:
            continue  # see starting_chapter description
        chapter_path = target_path + '\\' + create_filesystem_name(chapter_name)
        create_dir(chapter_path)
        chapter_end_pos = html_links_data.find('</ul>', pos)
        lecture_idx = 0
        while True:
            [pos_try, lecture_data] = get_lecture_data(html_links_data, pos, chapter_end_pos)
            if pos_try == -1:  # lecture from the next chapter; read the next chapter
                break
            lecture_idx += 1
            create_and_download_lecture(lecture_data, lecture_idx, chapter_path)
            pos = pos_try
            print create_filesystem_name(lecture_data[0])
def main():
    # Build a cookie jar holding just the "session" cookie so that urllib2
    # requests look like they come from a logged-in Coursera user.
    cj = cookielib.MozillaCookieJar()
    ck = cookielib.Cookie(version=0,
                          name='session',
                          value=cookie_content,
                          port=None,
                          port_specified=False,
                          domain='www.coursera.org',
                          domain_specified=False,
                          domain_initial_dot=False,
                          path='/algo',
                          path_specified=True,
                          secure=False,
                          expires=None,
                          discard=False,
                          comment=None,
                          comment_url=None,
                          rest={'HttpOnly': None},
                          rfc2109=False)
    cj.set_cookie(ck)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    connection = urllib2.urlopen('https://www.coursera.org/algo/lecture/index')
    html_data = connection.read()
    parse_and_download(html_data)

if __name__ == '__main__':
    main()
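# Usage sketch (assumptions: Python 2.7, the file saved as, say,
# coursera_downloader.py, and cookie_content / target_path filled in above):
#   python coursera_downloader.py
# The script creates one directory per chapter under target_path, one
# sub-directory per lecture inside it, and downloads every linked attachment
# into that sub-directory.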