@npiv
Created March 13, 2012 06:59
Coursera course downloader
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Coursera_downloader
# Purpose:     Downloader for the Coursera Design and Analysis of Algorithms
#              lectures. Requires Python 2.7.
#
# Author:      Alder
#
# Created:     11/03/2012
# Copyright:   (c) Alder 2012
# Licence:     free
#-------------------------------------------------------------------------------
# Set these values before running!
cookie_content = 'Copy here'
"""Set this to the content of the "session" cookie from your browser.
Since Coursera courses are available only to logged-in users, the script needs a
way to sneak into the logged-in area. Coursera stores the login info in the
"session" cookie, so:
1) Log in to Coursera in your browser.
2) Find the "session" cookie and check its value.
3) Copy that value and paste it into the cookie_content variable above.
In Firefox you can get it via Tools -> Options -> Privacy:
select "Use custom settings", press the "Show Cookies..." button,
find the Coursera cookies there, and copy the "Content" value of the "session" cookie.
It will be a very long string of characters, for example:
cookie_content = 'b65ee63c972b8a2eaa3---skipped--8a2'
"""
starting_chapter = 0
"""
Set this in case you want to start downloading from somewhere other than the
very beginning. It is a 0-based chapter index.
"""
target_path = "F:\\Video\\_Lectures\\DesignAndAnalysis"
# That one should be pretty straightforward :) Yes, it's an MS Windows path.
import urllib2
import cookielib
import errno
import os
def is_valid(x):
    if x.isalnum():
        return True
    if "()_.- ".find(x) != -1:
        return True
    return False
def create_filesystem_name(input_string):
    """Process input_string so that it becomes a valid directory/file name."""
    return filter(is_valid, input_string)
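# For example (illustrative): create_filesystem_name('1. Intro: Why Study Algorithms? (10 min)')
# returns '1. Intro Why Study Algorithms (10 min)' -- the colon and question mark
# are dropped because is_valid() only keeps alphanumerics and "()_.- ".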
def create_dir(chapter_name):
    # Create the directory; an already-existing directory is not an error.
    try:
        os.makedirs(chapter_name)
    except os.error as exc:
        if exc.errno != errno.EEXIST:
            raise
    return
def create_and_download_lecture(lecture_data, lecture_idx, chapter_path):
    lecture_path = chapter_path + '\\' + create_filesystem_name('%02d-%s' % (lecture_idx, lecture_data[0]))
    create_dir(lecture_path)
    os.chdir(lecture_path)
    for url in lecture_data[1:]:
        connection = urllib2.urlopen(url)
        meta = connection.info().getheaders("Content-Length")
        size = "unknown"
        if len(meta) > 0:
            size = meta[0]
        meta = connection.info().getheaders("Content-Disposition")
        if len(meta) > 0:
            # Prefer the file name suggested by the Content-Disposition header...
            file_name = create_filesystem_name(meta[0].split('filename=')[-1])
        else:
            # ...otherwise fall back to the last component of the final URL.
            file_name = create_filesystem_name(connection.geturl().split('/')[-1])
        print "downloading " + url + " size:" + size
        data = connection.read()
        f = open(file_name, 'wb')
        f.write(data)
        f.close()
    return
def get_chapter_name(html_links_data, start_pos):
    str = '<h3 class="list_header">'
    pos = html_links_data.find(str, start_pos)
    if pos == -1:
        return [-1, '']
    str2 = '</h3>'
    pos2 = html_links_data.find(str2, pos)
    return [pos2 + len(str2), html_links_data[pos + len(str):pos2]]
def get_lecture_attachment(html_links_data, start_pos):
    str_href = 'href="'
    pos = html_links_data.find(str_href, start_pos)
    if pos == -1:
        return [-1, '']
    pos += len(str_href)
    pos2 = html_links_data.find('"', pos)
    attachment = html_links_data[pos:pos2]
    return [pos2, attachment]
def get_lecture_data(html_links_data, start_pos, break_pos):
    pos = html_links_data.find('<a', start_pos)
    if pos == -1 or pos > break_pos:
        # No more lectures, or the next <a> already belongs to the next chapter.
        return [-1, '']
    pos = html_links_data.find('>', pos)
    pos2 = html_links_data.find('</a>', pos)
    lecture_name = html_links_data[pos + 1:pos2]
    lecture_end_pos = html_links_data.find('</li>', pos2)
    pos = pos_try = pos2
    result = [lecture_name]
    while True:
        [pos_try, attachment] = get_lecture_attachment(html_links_data, pos)
        if (pos_try == -1) or pos_try > lecture_end_pos:
            # No more attachments, or the attachment belongs to the next lecture.
            break
        result = result + [attachment]
        pos = pos_try
    return [pos, result]
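# Illustrative sketch of the lecture-list markup the parsing helpers above
# assume (reconstructed from the string searches; the real page may differ):
#   <div class="item_list">
#     <h3 class="list_header">Chapter title</h3>
#     <ul>
#       <li><a ...>Lecture title</a>
#           <a href=".../lecture.mp4">...</a> <a href=".../slides.pdf">...</a></li>
#       ...
#     </ul>
#     <h3 class="list_header">Next chapter title</h3>
#     ...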
def parse_and_download(html_data):
    html_links_data = html_data.partition('<div class="item_list">')[2]
    pos = 0
    chapter_idx = -1
    while True:
        [pos, chapter_name] = get_chapter_name(html_links_data, pos)
        if pos == -1:  # no more chapters
            break
        chapter_idx += 1
        if chapter_idx < starting_chapter:
            continue  # see starting_chapter description
        chapter_path = target_path + '\\' + create_filesystem_name(chapter_name)
        create_dir(chapter_path)
        chapter_end_pos = html_links_data.find('</ul>', pos)
        lecture_idx = 0
        while True:
            [pos_try, lecture_data] = get_lecture_data(html_links_data, pos, chapter_end_pos)
            if pos_try == -1:  # lecture from the next chapter; read the next chapter
                break
            lecture_idx += 1
            create_and_download_lecture(lecture_data, lecture_idx, chapter_path)
            pos = pos_try
            print create_filesystem_name(lecture_data[0])
def main():
    # Build a cookie jar holding just the "session" cookie so that urllib2
    # requests look like they come from a logged-in Coursera user.
    cj = cookielib.MozillaCookieJar()
    ck = cookielib.Cookie(version=0,
                          name='session',
                          value=cookie_content,
                          port=None,
                          port_specified=False,
                          domain='www.coursera.org',
                          domain_specified=False,
                          domain_initial_dot=False,
                          path='/algo',
                          path_specified=True,
                          secure=False,
                          expires=None,
                          discard=False,
                          comment=None,
                          comment_url=None,
                          rest={'HttpOnly': None},
                          rfc2109=False)
    cj.set_cookie(ck)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    connection = urllib2.urlopen('https://www.coursera.org/algo/lecture/index')
    html_data = connection.read()
    parse_and_download(html_data)

if __name__ == '__main__':
    main()
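# Usage sketch (assumptions: Python 2.7, the file saved as, say,
# coursera_downloader.py, and cookie_content / target_path filled in above):
#   python coursera_downloader.py
# The script creates one directory per chapter under target_path, one
# sub-directory per lecture inside it, and downloads every linked attachment
# into that sub-directory.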