Created
March 17, 2013 19:54
-
-
Save prwteas/5183341 to your computer and use it in GitHub Desktop.
Diff file for the proposed workaround for coursera-dl. Originally posted by Suresh Jayanty (jetume).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py | |
index bba5dcc..dc23175 100755 | |
--- a/coursera/coursera_dl.py | |
+++ b/coursera/coursera_dl.py | |
@@ -36,12 +36,16 @@ import tempfile | |
import time | |
import urllib | |
import urllib2 | |
+import pdb | |
try: | |
from BeautifulSoup import BeautifulSoup | |
except ImportError: | |
from bs4 import BeautifulSoup | |
+csrftoken = '' | |
+session = '' | |
+ | |
class ClassNotFoundException(BaseException): | |
""" | |
@@ -99,48 +103,96 @@ def get_auth_url(className): | |
Return the URL for authentication of the class given by className. | |
""" | |
- return 'http://class.coursera.org/%s/auth/auth_redirector?type=login&subtype=normal&email=&visiting=&minimal=true' \ | |
+ return 'https://class.coursera.org/%s/auth/auth_redirector?type=login&subtype=normal&email=&visiting=&minimal=true' \ | |
% className | |
+def get_new_auth_url(): | |
+ return 'https://www.coursera.org/maestro/api/user/login' | |
def get_syllabus_url(className): | |
""" | |
Return the Coursera index/syllabus URL. | |
""" | |
- return 'http://class.coursera.org/%s/lecture/index' % className | |
+ return 'https://class.coursera.org/%s/lecture/index' % className | |
def write_cookie_file(className, username, password): | |
""" | |
Automatically generate a cookie file for the coursera site. | |
""" | |
- | |
try: | |
+ global csrftoken | |
+ global session | |
hn, fn = tempfile.mkstemp() | |
+ cookies = cookielib.LWPCookieJar() | |
+ handlers = [ | |
+ urllib2.HTTPHandler(), | |
+ urllib2.HTTPSHandler(), | |
+ urllib2.HTTPCookieProcessor(cookies) | |
+ ] | |
+ opener = urllib2.build_opener(*handlers) | |
+ | |
+ req = urllib2.Request(get_syllabus_url(className)) | |
+ res = opener.open(req) | |
+ | |
+ for cookie in cookies: | |
+ if cookie.name == 'csrf_token': | |
+ csrftoken = cookie.value | |
+ break | |
+ opener.close() | |
+ | |
+ # Now make a call to the authenticator url: | |
cj = cookielib.MozillaCookieJar(fn) | |
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), | |
- urllib2.HTTPHandler()) | |
+ urllib2.HTTPHandler(), | |
+ urllib2.HTTPSHandler()) | |
- req = urllib2.Request(get_auth_url(className)) | |
- ref = opener.open(req).geturl() | |
+ opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken)) | |
+ opener.addheaders.append(('Referer', 'https://www.coursera.org')) | |
+ opener.addheaders.append(('X-CSRFToken', csrftoken)) | |
+ req = urllib2.Request(get_new_auth_url()) | |
- data = urllib.urlencode({'email': username, | |
- 'password': password, | |
- 'login': 'Login'}) | |
- req = urllib2.Request(ref, data) | |
+ data = urllib.urlencode({'email_address': username,'password': password}) | |
+ req.add_data(data) | |
opener.open(req) | |
except Exception as e: | |
if '404' in str(e): | |
raise ClassNotFoundException(className) | |
- | |
cj.save() | |
opener.close() | |
os.close(hn) | |
- | |
return fn | |
+def down_the_wabbit_hole(className, cookies_file): | |
+ """ | |
+ Try to handle the 4 step redirect before getting to the course's index page | |
+ """ | |
+ auth_redirector_url = str('https://class.coursera.org/%s/auth/auth_redirector?type=login&subtype=normal&email=&visiting=%s' % (className, urllib.quote_plus(get_syllabus_url(className)))) | |
+ | |
+ global session | |
+ cj = get_cookie_jar(cookies_file) | |
+ | |
+ opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPHandler(), | |
+ urllib2.HTTPSHandler()) | |
+ | |
+ req = urllib2.Request(auth_redirector_url) | |
+ opener.open(req) | |
+ | |
+ for cookie in cj: | |
+ if cookie.name == 'session': | |
+ session = cookie.value | |
+ break | |
+ opener.close() | |
+ | |
+ opener = urllib2.build_opener(urllib2.HTTPHandler(), urllib2.HTTPSHandler()) | |
+ req = urllib2.Request(get_syllabus_url(className)) | |
+ | |
+ opener.addheaders.append(('Cookie', 'csrf_token=%s;session=%s' % (csrftoken, session))) | |
+ opener.open(req) | |
+ | |
+ opener.close() | |
def get_netrc_path(path=None): | |
""" | |
@@ -175,21 +227,26 @@ def load_cookies_file(cookies_file): | |
cookies.seek(0) | |
return cookies | |
- | |
-def get_opener(cookies_file): | |
- """ | |
- Use cookie file to create a url opener. | |
- """ | |
- | |
+def get_cookie_jar(cookies_file): | |
cj = cookielib.MozillaCookieJar() | |
cookies = load_cookies_file(cookies_file) | |
# nasty hack: cj.load() requires a filename not a file, but if I use | |
# stringio, that file doesn't exist. I used NamedTemporaryFile before, | |
# but encountered problems on Windows. | |
- | |
cj._really_load(cookies, 'StringIO.cookies', False, False) | |
- return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) | |
+ | |
+ return cj | |
+ | |
+def get_opener(cookies_file): | |
+ """ | |
+ Use cookie file to create a url opener. | |
+ """ | |
+ | |
+ cj = get_cookie_jar(cookies_file) | |
+ | |
+ return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPHandler(), | |
+ urllib2.HTTPSHandler()) | |
def get_page(url, cookies_file): | |
@@ -197,8 +254,14 @@ def get_page(url, cookies_file): | |
Download an HTML page using the cookiejar. | |
""" | |
- opener = get_opener(cookies_file) | |
- ret = opener.open(url).read() | |
+ opener = urllib2.build_opener(urllib2.HTTPHandler(), urllib2.HTTPSHandler()) | |
+ req = urllib2.Request(url) | |
+ | |
+ opener.addheaders.append(('Cookie', 'csrf_token=%s;session=%s' % (csrftoken, session))) | |
+ ret = opener.open(req).read() | |
+ | |
+ # opener = get_opener(cookies_file) | |
+ # ret = opener.open(url).read() | |
opener.close() | |
return ret | |
@@ -222,6 +285,7 @@ def get_syllabus(class_name, cookies_file, local_page=False): | |
if not (local_page and os.path.exists(local_page)): | |
url = get_syllabus_url(class_name) | |
+ down_the_wabbit_hole(class_name, cookies_file) | |
page = get_page(url, cookies_file) | |
logging.info('Downloaded %s (%d bytes)', url, len(page)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment