Skip to content

Instantly share code, notes, and snippets.

@lamberta
Last active June 16, 2018 13:34
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save lamberta/6531309 to your computer and use it in GitHub Desktop.
Save lamberta/6531309 to your computer and use it in GitHub Desktop.
Download all the lecture videos from a Coursera class index page. Requires phantomjs and curl.
#!/usr/bin/env phantomjs
/**
* Scrapes a Coursera class index page and downloads the
* individual lecture mp4s. Requires phantomjs and curl.
* Usage: ./coursera-slurp [options] index-url
*
* If something breaks, it's probably the DOM selector in the
* remote page. Try fiddling with the 'linkSelector' function.
*/
var page = require('webpage').create(),
spawn = require("child_process").spawn,
args = require('system').args,
opts = parseOpts(args),
userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36",
max_proc = 5, //max concurrent child processes, 0 for unlimited
return_tally = 0, //keep track of returned "threads"
linkqueue; //collection of link objects
if (opts.help) {
console.log("Usage: coursera-slurp [options] index-url");
console.log("Options:");
console.log(" -h, --help Print help");
console.log(" -v, --verbose Print verbose output");
console.log(" -D, --dry-run Display remote files found but don't download");
console.log(" -A, --auth=token Supply login token, this is the 'CAUTH' cookie");
console.log(" -C, --cookies=string Use additional cookies");
phantom.exit(0);
}
if (opts.cookies) {
for (var name in opts.cookies) {
phantom.addCookie({
'name': name, //required
'value': opts.cookies[name], //required
'domain': '.coursera.org' //required
});
}
}
page.settings.userAgent = userAgent;
if (opts.verbose) {
console.log("Downloading page: " + opts.url);
}
page.open(opts.url, function (status) {
if (status !== 'success') {
console.warn('Failed to load url, aborting.');
phantom.exit(1);
}
linkqueue = page.evaluate(linkSelector).concat();
if (linkqueue.length === 0) {
console.error("No files found!");
phantom.exit(1);
}
if (opts.verbose || opts.dryrun) {
console.log("Found files:");
linkqueue.forEach(function (link) {
var filename = getLinkFilename(link),
downloadurl = getLinkURL(link);
console.log(downloadurl + " => " + filename);
});
}
if (opts.dry_run) {
phantom.exit(0);
}
//off we go ...
if (max_proc === 0) {
max_proc = linkqueue.length;
}
for (var i = 0; i < max_proc; i++) {
if (linkqueue.length > 0) {
var link = linkqueue.shift();
window.setTimeout(downloadLink, 0, link);
}
}
});
page.onError = function (msg, trace) {
console.error("Error in remote page:\n\t", msg);
};
/* Evaluated in webpage, limited cross-polination with phantomjs.
* @return {Array} Links in order: [link1, link2, ...]
* A link object looks like: {id: 'lectureid', title: 'lecture title', pos: idx}
*/
function linkSelector () {
var link_elems = document.querySelectorAll('.lecture-link'),
links = [];
Array.prototype.slice.call(link_elems).forEach(function (elem, i) {
links.push({
id: elem.getAttribute('data-lecture-id'), //url-id
title: elem.innerText.trim(), //link title
pos: i //position in page
});
});
return links;
}
/* Downloads the given link.
* On finish, download another or close application.
*/
function downloadLink (link) {
var filename = getLinkFilename(link),
downloadurl = getLinkURL(link),
curlOpts = ['--location', '--user-agent', userAgent, downloadurl, '--output', filename];
//add cookies
if (Array.isArray(phantom.cookies) && phantom.cookies.length > 0) {
var cookieStr = '';
phantom.cookies.forEach(function (cookie, i) {
if (i !== 0) { cookieStr += "; "; }
cookieStr += (cookie.name + "=" + cookie.value);
});
curlOpts.unshift('--cookie', cookieStr);
}
if (opts.verbose) {
curlOpts.unshift('--verbose');
console.log("curl options: " + curlOpts.join(' '));
}
//download
var proc = spawn('curl', curlOpts);
console.log("Downloading " + filename);
//on finish, use this "thread" to download the next link or ring finished bell
proc.on('exit', function (status) {
if (opts.verbose) {
console.log("Download finished for " + filename + ", status: " + status);
}
if (linkqueue.length > 0) {
var nextlink = linkqueue.shift();
downloadLink(nextlink);
} else {
return_tally += 1;
if (return_tally === max_proc) {
console.log("All downloads complete!");
phantom.exit();
}
}
});
if (opts.verbose) {
var logOut = function (data) { console.log(data); };
proc.stdout.on('data', logOut);
proc.stderr.on('data', logOut);
}
}
/* Construct download link from Link id.
* @return {string} URL of file download.
*/
function getLinkURL (link) {
var linkPostfix = "/download.mp4?lecture_id=",
url = opts.url,
base = (url[url.length-1] === '/') ? url.slice(0, -1) : url;
return (base + linkPostfix + link.id);
}
/* Create a output filename based using Link title.
* @return {string} Local filename, in form: '001-First-Lecture.mp4'
*/
function getLinkFilename (link) {
var base = link.title.replace(/\s/g, '-').replace(/[^\w\n-]/g, ''),
ext = "mp4",
filename = base + '.' + ext;
if (typeof link.pos === 'number') {
var i = link.pos + 1,
pre = (i < 100) ? ("00" + i).slice(-3) : i; //pad
filename = (pre + '-' + filename);
}
return filename;
}
/* Parse command-line options.
*/
function parseOpts (args) {
var opts = {
url: args[args.length-1] //url should be last
};
if (args.length < 2) {
opts.help = true;
} else {
for (var i = 1, len = args.length; i < len; i++) {
switch (args[i]) {
case '-h': case '--help':
opts.help = true;
break;
case '-D': case '--dry-run':
opts.dry_run = true;
break;
case '-C': case '--cookies':
if (!opts.cookies) { opts.cookies = {}; }
//parse cookie string
args[i+1].split(';').forEach(function (entry) {
var keyval = entry.trim().split('=');
opts.cookies[keyval[0]] = keyval[1];
});
break;
case '-A': case '--auth':
if (!opts.cookies) { opts.cookies = {}; }
opts.cookies = {'CAUTH': args[i+1]}
break;
case '-v': case '--verbose':
opts.verbose = true;
break;
}
}
}
return opts;
}
@AmarPrabhu
Copy link

I tried this script using phantomjs coursera-slurp.js --auth=myCAUTHkey https://class.coursera.org/algs4partII-003/lecture and it returns no files. However executing linkSelector() in browser console returns 12 links. My Phantomjs version is 1.9.7. What am I missing here? Any help would be great.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment