Created
March 13, 2012 06:59
-
-
Save npiv/2027342 to your computer and use it in GitHub Desktop.
Coursera course downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-------------------------------------------------------------------------------
# Name:        Coursera_downloader
# Purpose:     downloader for coursera design and analysis of algorithms lecture. Requires python 2.7
#
# Author:      Alder
#
# Created:     11/03/2012
# Copyright:   (c) Alder 2012
# Licence:     free
#-------------------------------------------------------------------------------
#!/usr/bin/env python
# NOTE(review): a shebang only takes effect on the very first line of a file.

# SET those values!!!!!!!!!
cookie_content = 'Copy here'
"""Set to content of "session" cookie from your browser.
Since Coursera courses are available only for logged in users, we need somehow
to sneak our script into logged in area. Coursera stores info in the "session" cookie.
So:
1) log in into coursera in your browser
2) Check cookie "session" value.
3) Copy this value and paste it into cookie_content variable
In firefox you can get it via Tools->Options->Privacy.
Select "Use custom settings" and press "show cookies..." button
Find Coursera cookies there, and copy "Content" value of "session" cookie.
This will be a LOOOOOONG string of symbols
example: cookie_content='b65ee63c972b8a2eaa3---skipped--8a2'
"""
starting_chapter = 0
"""
in case you want to start downloading not from the very beginning. 0-based index
"""
# Raw string: the original non-raw literal only worked because Python keeps
# unrecognized escape sequences (\V, \_, \D) literally; r"..." makes the
# Windows path explicit and safe against future additions like \t or \n.
target_path = r"F:\Video\_Lectures\DesignAndAnalysis"
# that one should be pretty straightforward :) Yes, it's MS Windows path
import cookielib
import errno
import os
import urllib2
def is_valid(ch):
    """Return True when *ch* is allowed in a file-system name as built here.

    Accepted: alphanumeric characters plus the literals ( ) _ . - and space.
    """
    return ch.isalnum() or ch in "()_.- "
def create_filesystem_name(input_string):
    """Return *input_string* with all characters unsafe for a directory/file
    name removed (only those accepted by is_valid() are kept).

    Uses "".join instead of filter(): on Python 2 filter(pred, str) happens
    to return a str, but that is a Py2-only quirk — the join form has the
    same behavior and stays correct on Python 3.
    """
    return "".join(c for c in input_string if is_valid(c))
def create_dir(chapter_name): | |
try: | |
os.makedirs(chapter_name) | |
except os.error as exc: | |
if (exc.errno != os.errno.EEXIST): | |
raise | |
return | |
def create_and_download_lecture(lecture_data, lecture_idx, chapter_path): | |
lecture_path = chapter_path+'\\'+create_filesystem_name('%02d-%s' % (lecture_idx,lecture_data[0])) | |
create_dir(lecture_path) | |
os.chdir(lecture_path) | |
for url in lecture_data[1:]: | |
connection =urllib2.urlopen(url) | |
meta = connection.info().getheaders("Content-Length") | |
size = "unknown"; | |
if len(meta)>0: | |
size = meta[0]; | |
meta = connection.info().getheaders("Content-Disposition") | |
if len(meta)>0: | |
file_name = create_filesystem_name(meta[0].split('filename=')[-1]) | |
else: | |
file_name = create_filesystem_name(connection.geturl().split('/')[-1]) | |
print "downloading "+url+ " size:"+size | |
data = connection.read() | |
f = open(file_name, 'wb') | |
f.write(data) | |
pass | |
return | |
def get_chapter_name(html_links_data, start_pos):
    """Find the next chapter heading at or after *start_pos*.

    Returns [position_just_after_the_closing_</h3>, chapter_name],
    or [-1, ''] when no further chapter heading exists.

    (Renamed the locals: the original shadowed the builtin `str`, and
    carried an unreachable trailing `pass`.)
    """
    open_tag = '<h3 class="list_header">'
    pos = html_links_data.find(open_tag, start_pos)
    if pos == -1:
        return [-1, '']
    close_tag = '</h3>'
    end = html_links_data.find(close_tag, pos)
    return [end + len(close_tag), html_links_data[pos + len(open_tag):end]]
def get_lecture_attachment(html_links_data, start_pos):
    """Extract the next href="..." target at or after *start_pos*.

    Returns [index_of_the_closing_quote, url], or [-1, ''] when no
    further link exists.
    """
    marker = 'href="'
    start = html_links_data.find(marker, start_pos)
    if start == -1:
        return [-1, '']
    start += len(marker)
    end = html_links_data.find('"', start)
    return [end, html_links_data[start:end]]
def get_lecture_data(html_links_data, start_pos, break_pos):
    """Parse one lecture entry located at/after *start_pos* but before *break_pos*.

    A lecture is an <a>name</a> anchor followed, up to the closing </li>,
    by the href attachments (video, slides, subtitles, ...) belonging to it.

    Returns [resume_position, [lecture_name, url1, url2, ...]], or
    [-1, ''] when the next anchor lies beyond break_pos (i.e. belongs to
    the next chapter) or no anchor remains at all.
    """
    pos = html_links_data.find('<a', start_pos)
    # Bug fix: the original only tested `pos > break_pos`, so a find()
    # miss (-1) slipped through and the function parsed garbage from a
    # negative start index instead of signalling "no more lectures".
    if pos == -1 or pos > break_pos:
        return [-1, '']
    pos = html_links_data.find('>', pos)
    name_end = html_links_data.find('</a>', pos)
    lecture_name = html_links_data[pos + 1:name_end]
    lecture_end_pos = html_links_data.find('</li>', name_end)
    result = [lecture_name]
    pos = name_end
    while True:
        [pos_try, attachment] = get_lecture_attachment(html_links_data, pos)
        if pos_try == -1 or pos_try > lecture_end_pos:
            # no more attachments, or the attachment belongs to the next lecture
            break
        result.append(attachment)
        pos = pos_try
    return [pos, result]
def parse_and_download(html_data): | |
html_links_data = html_data.partition('<div class="item_list">')[2] | |
pos = 0 | |
chapter_idx = -1; | |
while True: | |
[pos,chapter_name] = get_chapter_name(html_links_data,pos) | |
if pos == -1: #no more chapters | |
break; | |
chapter_idx+=1; | |
if chapter_idx < starting_chapter: | |
continue #see starting_chapter description | |
chapter_path = target_path+'\\'+create_filesystem_name(chapter_name) | |
create_dir(chapter_path) | |
chapter_end_pos = html_links_data.find('</ul>',pos) | |
lecture_idx=0 | |
while True: | |
[pos_try, lecture_data] = get_lecture_data(html_links_data,pos,chapter_end_pos) | |
if pos_try == -1: #lecture from next chapter. Let's read next chapter | |
break; | |
lecture_idx+=1 | |
create_and_download_lecture(lecture_data,lecture_idx,chapter_path) | |
pos = pos_try | |
print create_filesystem_name(lecture_data[0]) | |
pass | |
def main():
    """Entry point: authenticate via the copied session cookie, fetch the
    lecture index page and hand it to the parser/downloader.
    """
    jar = cookielib.MozillaCookieJar()
    # Forge the "session" cookie by hand from the value the user copied
    # out of their browser (see cookie_content docs at the top of the file).
    session_cookie = cookielib.Cookie(
        version=0,
        name='session',
        value=cookie_content,
        port=None,
        port_specified=False,
        domain='www.coursera.org',
        domain_specified=False,
        domain_initial_dot=False,
        path='/algo',
        path_specified=True,
        secure=False,
        expires=None,
        discard=False,
        comment=None,
        comment_url=None,
        rest={'HttpOnly': None},
        rfc2109=False)
    jar.set_cookie(session_cookie)
    # route every subsequent urllib2 request through the cookie-aware opener
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(jar)))
    connection = urllib2.urlopen('https://www.coursera.org/algo/lecture/index')
    parse_and_download(connection.read())

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment