Skip to content

Instantly share code, notes, and snippets.

@0x4E69676874466F78
Forked from archagon/gdc-downloader.py
Created May 17, 2013 13:39
Show Gist options
  • Save 0x4E69676874466F78/5599085 to your computer and use it in GitHub Desktop.
Save 0x4E69676874466F78/5599085 to your computer and use it in GitHub Desktop.
# GDC Vault videos can't be watched on mobile devices and this is a very sad thing indeed!
# This script is designed to circumvent this by downloading the lecture and slideshow
# videos which can then be re-encoded into whatever format you wish.
# Note: this code is rather flimsy and was written as fast as possible for my own personal use.
# The code only works for the most recent GDC Vault videos, since they all use the same player
# format. If the XML format used to run the player is changed (as it has been in the past), the code
# will have to be reconfigured. In the past, I was able to feed a wget-compatible cookies.txt
# file into the wget call, but I can't get it to trigger anymore. So for now, the way I download
# each video is I look at the source for the player page, find the player.html URL, and feed
# it into the script with the -f flag. Ugly and slow, but hey, it works.
# I generally hate reinventing the wheel and it does look like youtube-dl does some of the same
# stuff I'm doing, but I couldn't get it to work with the GDC URLs. So off to Python land we go!!!
# Usage is as follows:
#
# With cookies.txt: gdc-downloader.py "[GDC video URL]" [output dir]
# Without cookies.txt: gdc-downloader.py -f "[GDC player.html URL]" [output dir]
#
# A GDC video URL looks like this:
# http://www.gdcvault.com/play/1015662/Creative-Panic-How-Agility-Turned
#
# A GDC player.html URL looks like this:
# http://evt.dispeak.com/ubm/gdc/sf12/player.html?xmlURL=xml/201203238_1331124629609NXXJ.xml&token=1234567890
#
# The output dir should be the name of your video. For example, supplying TestDir/GDCVid will create
# TestDir/GDCVid/GDCVid.xml, TestDir/GDCVid/GDCVid-slide.flv, etc.
# You need to have wget and rtmpdump installed in order for this script to work. I recommend macports.
#############
# Constants #
#############
# Cookie file opened by check_dependencies() and handed to wget via --load-cookies.
cookies_filename = "cookies.txt"
# Matched against a downloaded page with its newlines removed. The player URL must
# sit inside double quotes and its xmlURL value must be followed by "&".
# Groups as consumed by retrieve_data_from_base_url():
#   1 = text before "player.html" (used as the base URL)
#   2 = the literal "player.html"
#   3 = everything after "player.html" up to the closing quote (the player arguments)
#   4 = the xmlURL value
player_regular_expression = r"^.*\"(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)\".*$"
# Same four groups, but anchored on a bare URL string rather than quoted text in a page.
player_regular_expression_force = r"^(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)$" # same as above but parses URL directly
# Any quoted string containing "login.php"; a match is treated as "we were served the login page".
login_regular_expression = r"^.*\"(.*?login\.php.*?)\".*$"
# Group 1 is the quoted value that follows "src" after the marker comment in the player page;
# the caller appends ".swf" to it.
swf_name_regular_expression = r"^.*embed the Flash Content SWF when all tests are passed.*?\"src\".*?\"(.*?)\".*$"
# DEPRECATED: This URL was retrieved from the player SWF, and may change in the future.
# rtmp_url = "rtmp://fms.digitallyspeaking.com/cfx/st/ondemand/fcs/ident"
########
# Code #
########
import sys
import os
import subprocess
import re
from xml.dom import minidom
def error(message):
    """Print an error line and terminate the script.

    message -- text appended after the "[gdc-downloader] Error: " prefix.

    Never returns normally: sys.exit(1) raises SystemExit with code 1.
    """
    # A single parenthesised argument prints the same text whether this is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    print("[gdc-downloader] Error: " + message)
    sys.exit(1)
def message(msg):
    """Print an informational line.

    msg -- text appended after the "[gdc-downloader] Message: " prefix.
    """
    # A single parenthesised argument prints the same text whether this is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    print("[gdc-downloader] Message: " + msg)
def check_dependencies(force):
    """Check that the files the download relies on are present.

    force -- when true the cookies file check is skipped (the script sets
             this when it is started with -f).

    Exits through error() if the cookies file cannot be opened.
    """
    if not force:
        try:
            # The file is opened only to prove it is readable; the context
            # manager closes it again straight away.
            with open(cookies_filename):
                pass
        except IOError:
            error("cookies not found in " + cookies_filename)
    # TODO: check wget, rtmpdump
def dump_to_file(data, dest):
    """Write data to the file at dest, replacing any existing content.

    data -- text or byte string to store.
    dest -- target file path; its parent directory is created if missing.
    """
    dest_dir = os.path.abspath(os.path.split(dest)[0])
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    # On Python 3 a bytes payload cannot be written to a text-mode file.
    # On Python 2 bytes is str, so this test is never true and the file is
    # opened in "w" mode exactly as before.
    is_binary = isinstance(data, bytes) and not isinstance(data, str)
    # The context manager closes the handle even if write() raises.
    with open(dest, "wb" if is_binary else "w") as debug_file:
        debug_file.write(data)
def download_url(url):
    """Fetch url with wget and return whatever wget wrote to stdout.

    url -- address to download; cookies are loaded from cookies_filename.

    Returns the captured standard output of the wget process.
    Exits through error() if the wget process cannot be started.
    """
    # -q silences wget's progress output, -O- streams the body to stdout.
    args = ["wget", "-qO-", "--load-cookies", cookies_filename, url]
    try:
        process = subprocess.Popen(args, stdout=subprocess.PIPE)
    except OSError:
        # Raised when the executable cannot be launched (e.g. wget is not installed).
        error("wget error with url " + url)
    # communicate() returns (stdout, stderr); only stdout was piped.
    return process.communicate()[0]
def retrieve_data_from_base_url(url, force, xml_dest):
    """Work out the player, SWF and XML URLs for a video.

    url      -- the video page URL, or the player.html URL itself when force is set.
    force    -- when true, url is matched directly instead of being downloaded.
    xml_dest -- file that receives the text the player URL was matched against.

    Returns a dict with the keys "player_url", "swf_url" and "xml_url".
    Exits through error() when the login page was served, or when the player
    URL or the SWF name cannot be found.
    """
    if force:
        # The caller handed us the player URL, so there is no page to fetch.
        html = url
        player_results = re.match(player_regular_expression_force, html)
    else:
        # I haven't tested this code in a while, so it might not work
        page = download_url(url)
        # Line breaks are stripped so the ^...$ patterns see one single line.
        html = page.replace('\n', '').replace('\r', '')
        if re.match(login_regular_expression, html):
            error("downloaded login page -- check your cookies")
        player_results = re.match(player_regular_expression, html)

    if not player_results:
        error("player URL not found")

    dump_to_file(html, xml_dest)

    base_url, player_url, player_arguments, xml_url = player_results.group(1, 2, 3, 4)
    full_xml_url = base_url + xml_url

    message("player url is " + base_url + player_url)
    message("player arguments are " + player_arguments)
    message("xml url is " + full_xml_url)

    # The player page names the SWF file without its extension.
    player_page = download_url(base_url + player_url)
    player_html = player_page.replace('\n', '').replace('\r', '')
    swf_name_results = re.match(swf_name_regular_expression, player_html)
    if not swf_name_results:
        error("SWF URL not found")

    swf_url = base_url + swf_name_results.group(1) + ".swf"
    message("swf url is " + swf_url)

    return {
        "player_url": base_url + player_url + player_arguments,
        "swf_url": swf_url,
        "xml_url": full_xml_url,
    }
def _first_element_text(elements):
    """Return the nodeValue of the first element's first child, or None if there is none."""
    if not len(elements):
        return None
    first_child = elements[0].firstChild
    if first_child is None:
        return None
    return first_child.nodeValue


def parse_xml_from_url(url, xml_dest):
    """Download the player XML and pull out the stream names.

    url      -- address of the XML description.
    xml_dest -- file that receives a copy of the downloaded XML.

    Returns a dict with the keys "akamai" (rtmp URL built from the akamaiHost
    element), "speaker" and "slide" (video names with ".flv" removed) and
    "audio" (dict mapping each audio code to its url with ".flv" removed).
    Exits through error() when akamaiHost, speakerVideo or slideVideo is
    missing or has no content.
    """
    xml = download_url(url)
    dump_to_file(xml, xml_dest)
    parsed_xml = minidom.parseString(xml)

    host = _first_element_text(parsed_xml.getElementsByTagName("akamaiHost"))
    speaker = _first_element_text(parsed_xml.getElementsByTagName("speakerVideo"))
    slide = _first_element_text(parsed_xml.getElementsByTagName("slideVideo"))
    # An element that is present but empty is treated like a missing one, so
    # it is reported here rather than failing on a None value below.
    if host is None or speaker is None or slide is None:
        error("xml missing properties")

    akamai_host = "rtmp://" + host + "/fcs/ident"
    speaker_video = speaker.replace(".flv", "")
    slide_video = slide.replace(".flv", "")
    message("akamai host is " + akamai_host)
    message("speaker video is " + speaker_video)
    message("slide video is " + slide_video)

    # some of the xml files contain extra audio tracks; we want those, don't we?
    audio_metadata = {}
    audios = parsed_xml.getElementsByTagName("audios")
    if audios:
        for audio_node in audios[0].getElementsByTagName("audio"):
            # getAttribute() yields "" for an absent attribute.
            code = audio_node.getAttribute("code")
            audio_url = audio_node.getAttribute("url").replace(".flv", "")
            # A track needs both a code (used in the output file name) and a
            # url (the stream to fetch); skip entries lacking either one.
            if not code or not audio_url:
                continue
            audio_metadata[code] = audio_url
            message("audio " + code + " is " + audio_url)

    return {
        "akamai": akamai_host,
        "speaker": speaker_video,
        "slide": slide_video,
        "audio": audio_metadata,
    }
def download_video(rtmp, playpath, swf_url, page_url, filename):
    """Download one stream to filename by running rtmpdump.

    rtmp     -- value for rtmpdump's --rtmp option.
    playpath -- value for --playpath (the stream name).
    swf_url  -- value for --swfUrl.
    page_url -- value for --pageUrl.
    filename -- value for --flv (the output file).

    Always returns None. Exits through error() if rtmpdump cannot be started.
    A non-zero exit status is reported but does not stop the script, so the
    remaining streams are still attempted.
    """
    args = ["rtmpdump", "--rtmp", rtmp, "--playpath", playpath, "--swfUrl", swf_url, "--pageUrl", page_url, "--flv", filename]
    try:
        retval = subprocess.call(args, stdin=None)
    except OSError:
        # Raised when the executable cannot be launched (e.g. rtmpdump is not installed).
        error("rtmpdump error")
    if retval != 0:
        message("rtmpdump exited with status " + str(retval) + " for " + filename)
    return None
def download_gdc_video_at_url(url, dest="", force=False):
    """Run the whole download for one video.

    url   -- video page URL, or the player.html URL when force is set.
    dest  -- output directory; its last path component also prefixes every
             output file name ("GDCVideo" when that component is empty).
    force -- passed on to the dependency check and the URL extraction.
    """
    dest_path = os.path.abspath(dest)
    leaf = os.path.split(dest)[1]
    dest_name = leaf if leaf != "" else "GDCVideo"

    def output_path(suffix):
        # Every output file lives in dest_path and shares the dest_name prefix.
        return os.path.join(dest_path, dest_name + suffix)

    # Step 0: Check dependencies.
    check_dependencies(force)

    # Step 1: Extract the following from the URL: player URL, SWF URL, XML URL.
    data = retrieve_data_from_base_url(url, force, output_path("-player-url.txt"))

    # Step 2: Parse the XML and extract the speaker video URL, slide video URL, and metadata.
    metadata = parse_xml_from_url(data["xml_url"], output_path(".xml"))

    # Step 3: Download the videos.
    host = metadata["akamai"]
    swf_url = data["swf_url"]
    player_url = data["player_url"]
    download_video(host, metadata["slide"], swf_url, player_url, output_path("-slide.flv"))
    download_video(host, metadata["speaker"], swf_url, player_url, output_path("-speaker.flv"))
    audio_tracks = metadata["audio"]
    for code in audio_tracks:
        download_video(host, audio_tracks[code], swf_url, player_url, output_path("-audio-" + code + ".flv"))

    message("All done!")
if __name__ == "__main__":
    # Accepted command lines:
    #   script URL [output dir]
    #   script -f PLAYER_URL [output dir]
    force = len(sys.argv) >= 2 and sys.argv[1] == "-f"
    # Count only the arguments after the optional -f, so "-f" on its own is
    # rejected as a usage error rather than indexing past the end of sys.argv.
    positional = sys.argv[2:] if force else sys.argv[1:]
    if len(positional) not in (1, 2):
        error("invalid number of arguments")
    try:
        dest = positional[1] if len(positional) == 2 else ""
        download_gdc_video_at_url(positional[0], dest, force)
    except KeyboardInterrupt:
        error("program interrupted")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment