Skip to content

Instantly share code, notes, and snippets.

@tokland
Last active November 24, 2016 08:14
Show Gist options
  • Save tokland/4968851 to your computer and use it in GitHub Desktop.
Save tokland/4968851 to your computer and use it in GitHub Desktop.
Download video lectures from Coursera courses.
#!/usr/bin/ruby
#
# Download lecture videos of courses from Coursera (http://www.coursera.org).
#
# Install requirements:
#
# $ gem install curb trollop nokogiri capybara ruby-progressbar
#
# Example -- Download all video lectures of courses "Calculus: Single Variable"
# and "Introduction to Astronomy":
#
# $ ruby coursera-download-videos.rb \
# --email=user@server.org --password=123456 \
# "Calculus: Single Variable" "Introduction to Astronomy"
#
# Contact: tokland@gmail.com
require 'rubygems'
require 'fileutils'
require 'logger'
require 'set'
require 'progressbar'
require 'capybara'
require 'nokogiri'
require 'trollop'
require 'curl'
### Extensions
# Hash helpers used by the option handling in this script.
class Hash
  # Keep only the entries whose key is one of +keys+.
  def slice(*keys)
    wanted = Set.new(keys)
    select { |key, _value| wanted.include?(key) }
  end

  # Fill the receiver (in place) with the entries of +default_hash+ that it
  # does not define itself, and return the receiver.
  #
  # Raises ArgumentError when the receiver holds a key that is not present in
  # +default_hash+.
  def defaults(default_hash)
    unexpected = keys - default_hash.keys
    unless unexpected.empty?
      raise ArgumentError.new("unknown key(s): #{unexpected.join(', ')}")
    end
    replace(default_hash.merge(self))
  end
end
# Stand-in object whose instances answer nil to any message that is not
# explicitly kept below.
class MaybeWrapper
  # Remove every listed instance method except object_id and the
  # double-underscore ones, so that all other calls reach method_missing.
  instance_methods.each do |name|
    next if name == :object_id || name =~ /^__/
    undef_method(name)
  end

  # Accept any message, with any arguments or block, and answer nil.
  def method_missing(*args, &block)
    nil
  end
end
class Object
  # Nil-tolerant access to the receiver.
  #
  # With a block: yields the receiver and returns the block's value, or
  # returns nil without calling the block when the receiver is nil.
  # Without a block: returns the receiver itself, or a MaybeWrapper when the
  # receiver is nil.
  def maybe(&block)
    if nil?
      block_given? ? nil : MaybeWrapper.new
    elsif block_given?
      yield(self)
    else
      self
    end
  end
end
module Curl
  # Download +url+ and write the response body to the file +destination+.
  #
  # options:
  #   :progressbar - when truthy, show a console progress bar titled after the
  #                  destination file name without its extension
  #                  (default: false).
  #   :cookies     - array of "name=value" strings sent in a Cookie header
  #                  (default: []). No header is sent when it is empty or nil.
  #
  # Raises ArgumentError (from Hash#defaults) on unknown option keys.
  def self.download_to_file(url, destination, options = {})
    options.defaults(:progressbar => false, :cookies => [])
    cookies = Array(options[:cookies])

    # File.open rather than Kernel#open: the latter gives special meaning to
    # arguments starting with "|".
    File.open(destination, "wb") do |fd|
      curl = Curl::Easy.new(url).tap do |c|
        c.follow_location = true
        c.enable_cookies = true
        c.headers["Cookie"] = cookies.join("; ") unless cookies.empty?
        c.on_body { |data| fd.write(data) }
      end

      if options[:progressbar]
        title = File.basename(destination, File.extname(destination))
        pbar = nil
        pbar_total = nil
        curl.on_progress do |dl_total, dl_now, _ul_total, _ul_now|
          if dl_total > 0
            # (Re)create the bar the first time a total is known and whenever
            # the reported total grows (compare against the stored total, not
            # against the bar object).
            if pbar.nil? || dl_total > pbar_total
              pbar_total = dl_total
              pbar = ProgressBar.new(title, dl_total)
              pbar.format_arguments =
                [:title, :percentage, :bar, :stat_for_file_transfer]
            end
            pbar.set(dl_now)
          end
          # The block always answers true, as in the original implementation.
          true
        end
        curl.perform
        pbar.finish if pbar
      else
        curl.perform
      end
    end
  end
end
# Thin wrapper that configures Capybara globally and exposes its DSL
# (visit, find, fill_in, ...) as instance methods.
class CapybaraBrowser
  include Capybara::DSL

  # options:
  #   :driver            - Capybara driver name (default: :selenium).
  #   :default_wait_time - value assigned to Capybara's wait-time setting
  #                        (default: 60).
  #
  # Raises ArgumentError (from Hash#defaults) on unknown option keys.
  def initialize(options = {})
    options.defaults(:driver => :selenium, :default_wait_time => 60)
    Capybara.current_driver = options.fetch(:driver)
    wait_time = options.fetch(:default_wait_time)
    # Newer Capybara releases renamed default_wait_time to
    # default_max_wait_time; use whichever setter this version provides.
    if Capybara.respond_to?(:default_max_wait_time=)
      Capybara.default_max_wait_time = wait_time
    else
      Capybara.default_wait_time = wait_time
    end
    Capybara.run_server = false
  end

  # Cookies of the current session as an array of "name=value" strings.
  #
  # Raises ArgumentError for drivers other than capybara-webkit and Selenium.
  def cookies
    driver = Capybara.current_session.driver
    case driver.class.name
    when "Capybara::Webkit::Driver"
      # Keep only the part before the first ";" of each cookie string.
      driver.browser.get_cookies.map { |s| s.split(";").first }
    when "Capybara::Selenium::Driver"
      driver.browser.manage.all_cookies.map { |c| [c[:name], c[:value]].join("=") }
    else
      raise ArgumentError.new("Unsupported driver: #{driver.class.name}")
    end
  end
end
### Application
# Drives a browser session on coursera.org and downloads the MP4 lecture
# videos linked from a course's lecture index page.
class Coursera
  attr_reader :browser, :logger

  SessionError = Class.new(StandardError)
  ParserError = Class.new(StandardError)

  # +options+ is accepted but currently unused.
  def initialize(options = {})
    @browser = CapybaraBrowser.new
    @logger = Logger.new(STDERR)
    # Log the bare message only (no severity, timestamp or program name).
    @logger.formatter = proc { |severity, datetime, progname, msg| "#{msg}\n" }
  end

  # Fill in and submit the sign-in form, then look up the account-name
  # element in the page header (Capybara's find raises when it is absent).
  def login(email, password)
    logger.debug("Login: email='#{email}' password='#{'*' * password.size}'")
    browser.visit("https://www.coursera.org/account/signin")
    browser.fill_in("signin-email", :with => email)
    browser.fill_in("signin-password", :with => password)
    browser.click_button("Sign In")
    browser.find(".coursera-header-account-name")
  end

  # Download every lecture video of the course whose link text is
  # +course_name+. Files that already exist are skipped.
  #
  # options:
  #   :destination_directory - base directory for the downloads; when nil the
  #                            paths are relative (default: nil).
  #
  # Returns the array of video file paths.
  # Raises SessionError when the account-name header element is missing, and
  # ParserError when the lecture page does not have the expected structure.
  def download_videos(course_name, options = {})
    options.defaults(:destination_directory => nil)
    browser.visit("https://www.coursera.org/")
    # Capybara's find raises instead of returning nil when nothing matches,
    # so the logged-in check must use the has_css? predicate.
    unless browser.has_css?(".coursera-header-account-name")
      raise SessionError.new("Not logged in")
    end

    browser.click_link(course_name)
    browser.find(:xpath,
      '//*[@class="course-navbar-item"]/a[contains(@href, "lecture/index")]').click
    browser.find("#spark")
    get_videos_from_course_page(browser.html).map do |info|
      course, section, index, lecture, url = info
      directory = File.join([options[:destination_directory], course, section].compact)
      path = File.join(directory, "%02d - %s.mp4" % [index+1, lecture])
      logger.debug("Download video: #{path}")
      safe_download(url, path, browser.cookies)
      path
    end
  end

  private

  # Parse the lecture index page +html+ and return one
  # [course_name, section_name, lecture_index, lecture_title, url] array per
  # lecture. Raises ParserError when an expected element is missing.
  def get_videos_from_course_page(html)
    doc = Nokogiri::HTML(html)
    heading = doc.at_css("h1") or
      raise ParserError.new("Cannot find course name")
    course_name = heading.text
    logger.debug("Course '#{course_name}'")

    # Some courses have the videos in reverse order, detect this case
    # with a simple heuristic (check if the first integers in the section
    # titles are in descending order).
    lis = doc.css(".course-item-list-header")
    ns = lis.map { |li| li.text[/\d+/] }.compact.map { |digits| digits.to_i }
    is_reversed = !ns.empty? && ns.each_cons(2).all? { |x, y| x > y }
    ordered_lis = is_reversed ? lis.reverse : lis

    ordered_lis.flat_map.with_index do |section, section_index|
      h3 = section.at_css("h3") or
        raise ParserError.new("Cannot find h3")
      # Collapse runs of whitespace so the title is usable as a directory name.
      section_title = h3.text.gsub(/[[:space:]]+/, ' ').strip
      section_name = "%02d - %s" % [section_index+1, section_title]
      # The lecture list is taken from the node that follows the section header.
      lecture_video_list = section.next or
        raise ParserError.new("Cannot find lecture video list")

      lecture_video_list.css("li").map.with_index do |lecture, index|
        link = lecture.at_css("a.lecture-link") or
          raise ParserError.new("Cannot find lecture link")
        # "/" would be read as a path separator in the file name.
        lecture_title = link.text.gsub("/", "-").strip
        video_link = lecture.at_css("a[title='Video (MP4)']")
        url = video_link && video_link["href"]
        raise ParserError.new("Cannot find video link") unless url
        [course_name, section_name, index, lecture_title, url]
      end
    end
  end

  # Download +url+ to +path+ unless +path+ already exists. The data is first
  # written to "<path>.partial" and moved into place only after the download
  # finished, so an interrupted run never leaves a truncated file at +path+.
  def safe_download(url, path, cookies)
    return if File.exist?(path)

    FileUtils.mkpath(File.dirname(path))
    temp_path = path + ".partial"
    begin
      Curl.download_to_file(url, temp_path, :progressbar => true, :cookies => cookies)
      FileUtils.mv(temp_path, path)
    ensure
      # No-op after a successful move; removes the leftover on failure.
      FileUtils.safe_unlink(temp_path)
    end
  end
end
# Command-line entry point: parse the options, sign in once, then download
# each course named on the command line.
if $PROGRAM_NAME == __FILE__
  cli_options = Trollop.options do
    banner "Usage: download-coursera-videos [OPTIONS] COURSENAME [COURSENAME2 ...]"
    opt :destination_directory, "Directory destination", :type => :string
    opt :email, "Email", :type => :string, :required => true
    opt :password, "Password", :type => :string, :required => true
  end

  if ARGV.any?
    client = Coursera.new
    client.login(cli_options[:email], cli_options[:password])
    # Only the destination directory is forwarded to download_videos.
    video_options = cli_options.slice(:destination_directory)
    ARGV.each { |course_name| client.download_videos(course_name, video_options) }
  else
    Trollop.die("Specify at least one course to download")
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment