Skip to content

Instantly share code, notes, and snippets.

@apit
Created September 6, 2011 05:28
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apit/1196670 to your computer and use it in GitHub Desktop.
TED video/subtitle/comments downloader
#! /usr/bin/python -u
"""
@author apit - http://fizdoonk.multiply.com/
@require python-lxml, python-simplejson
"""
import os, sys, re, codecs
import simplejson
import urllib2
import lxml.html
from lxml.html.clean import Cleaner
from urllib import urlopen
from subprocess import call
# Set to False to fetch only subtitles/comments and skip the .mp4 itself.
DOWNLOAD_VIDEO = True
OUTPUT_DIR = './out/'
# True -> fetch the smaller "-light" encode of the talk video.
VIDEO_LOWRES = True
BASE_URL = "http://www.ted.com"
# %s placeholders: talk id, language code.
SUBTITLE_URL = BASE_URL + "/talks/subtitles/id/%s/lang/%s"
SUBTITLE_HTML = BASE_URL + "/talks/subtitles/id/%s/lang/%s/format/html"
# %s placeholder: forum id; limit=999999 effectively requests every comment.
COMMENTS_HTML = BASE_URL + "/comments/viewRPC?forum=%s&commentsLoaded=0&limit=999999&sortby=replies"
# Transcript languages to save when a talk offers them (TED 3-letter codes).
DOWNLOADABLE_LANGS = set(['ind', 'eng'])
# Honour an http_proxy environment variable for all urllib2 requests.
if 'http_proxy' in os.environ:
    proxy = urllib2.ProxyHandler({'http': os.environ['http_proxy']})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
# from django.template.defaultfilters
def slugify(value):
    """Normalize *value* into a filename-safe slug.

    Strips characters that are not alphanumerics, underscores, whitespace
    or hyphens, lowercases the result, and collapses runs of hyphens and
    whitespace into single underscores.
    """
    # Raw strings keep backslash sequences (\w, \s) out of the hands of the
    # ordinary string-literal parser.
    value = unicode(re.sub(r'[^\w\s-]', '', value).strip().lower())
    return re.sub(r'[-\s]+', '_', value)
def save_to(filename, content):
    """Write *content* to *filename* as UTF-8 encoded text.

    Uses a ``with`` block so the handle is closed even when the write
    raises (the original leaked the handle on error).
    """
    with codecs.open(filename, "w", "utf-8") as f:
        f.write(content)
def save_jsonfile(filename, json):
    """Serialize *json* to *filename* as a UTF-8 JSON document."""
    serialized = simplejson.dumps(json)
    handle = codecs.open(filename, "w", "utf-8")
    handle.write(serialized)
    handle.close()
def load_jsonfile(filename):
    """Read *filename* (UTF-8 JSON) and return the decoded object.

    Raises IOError when the file is missing — callers rely on that to
    detect a cold cache.  The handle is now closed explicitly; the
    original returned without closing it, leaking the descriptor.
    """
    f = codecs.open(filename, 'r', "utf-8")
    try:
        return simplejson.loads(f.read())
    finally:
        f.close()
def captions_to_sub(captions, offset):
    """Convert TED caption dicts into SubRip (SRT) text.

    Each caption is a dict with keys ``content``, ``startTime`` and
    ``duration`` (times in milliseconds); ``startOfParagraph`` is unused.
    *offset* (int or numeric string) is added to every start time.

    Fixes over the original: the caller's caption dicts are no longer
    mutated, the builtin ``min`` is no longer shadowed, and the output is
    assembled with ``join`` instead of quadratic string ``+=``.
    """
    offset = int(offset)
    entries = []
    for index, cap in enumerate(captions, 1):
        start = cap['startTime'] + offset
        end = start + cap['duration']
        entries.append("%d%s%s --> %s%s%s%s%s" % (
            index, os.linesep,
            _ms_to_timestamp(start), _ms_to_timestamp(end),
            os.linesep, cap['content'], os.linesep, os.linesep))
    return ''.join(entries)

def _ms_to_timestamp(ms):
    """Format a millisecond count as an SRT timestamp.

    Talks run under one hour, hence the fixed '00' hour field (matching
    the original's comment and behavior).
    """
    minutes, ms = divmod(ms, 60000)
    seconds, ms = divmod(ms, 1000)
    return "00:%02d:%02d,%03d" % (minutes, seconds, ms)
def fetch_subtitle(id, lang, offset):
print " > fetching subtitle (%s)... " % lang.upper()
subtitle_url = SUBTITLE_URL % (id, lang)
subtitle_html = SUBTITLE_HTML % (id, lang)
f = urllib2.urlopen(SUBTITLE_URL % (id, lang))
json = unicode(f.read(), "utf-8")
subtitle = captions_to_sub(simplejson.loads(json)['captions'], offset)
f = urllib2.urlopen(subtitle_html)
cleaner = Cleaner(remove_tags=['a'])
subtitle_html = unicode(cleaner.clean_html(f.read()), "utf-8")
return [subtitle, subtitle_html]
def fetch_comments(id):
print " > fetching comments..."
wrapper = """
<html>
<body>
<link rel="stylesheet" type="text/css" media="screen" href="http://www.ted.com/css/comments.css" />
<div id="conversations">
<div id="discussion" class="silverBorderPanel">
<div class="clearfix"></div>
<h3>Comment on this Talk</h3>
<div class="clearfix">&nbsp;</div>
<div class="comment-container">%s</div>
</div>
</div></body></html>
"""
try:
f = urllib2.urlopen(COMMENTS_HTML % id)
return wrapper % unicode(f.read(), "utf-8")
except:
return ''
def parse_info(html):
"""
Parse requested page and look for title, talk id, and languages transcripted.
"""
filepattern = r'/talks/([_\w]+\-light\.mp4)' if VIDEO_LOWRES \
else r'/talks/([_\w]+\.mp4)'
match = re.search(filepattern, html)
if not match:
print "No video link"
sys.exit(-1)
download_url = "http://download.ted.com/talks/%s" % match.group(1)
title = re.search(r'altheadline\ \=\ \'([^\']+)\'', html).group(1)
offset = re.search(r'introDuration:(\d+)', html).group(1)
id = re.search(r'ti:\"(\d+)\"', html).group(1)
forum_id = re.search(r'forum:\ (\d+)', html).group(1)
doc = lxml.html.fromstring(html)
select = doc.cssselect("#languageCode")[0]
langs = select.value_options
return {'id':id,
'forum_id': forum_id,
'title': title,
'offset': offset,
'langs': langs,
'download_url': download_url
}
def fetch_page_info(url, offset, file_pattern):
    """Download the talk page, its subtitles (per DOWNLOADABLE_LANGS) and
    comments, save them through *file_pattern*, and return the info dict.

    *offset* is in seconds and, when non-zero, overrides the page's own
    introDuration.  BUG FIX: the original computed ``offset * 1000`` and
    then never used it, so the CLI offset override had no effect — and a
    string offset from argv would have been repeated 1000 times rather
    than multiplied.
    """
    override_ms = int(offset) * 1000  # seconds -> milliseconds
    f = urllib2.urlopen(url)
    html = f.read()
    info = parse_info(html)
    # Prefer the explicit override; fall back to the page-supplied offset.
    subtitle_offset = override_ms if override_ms else info['offset']
    for lang in set(info['langs']).intersection(DOWNLOADABLE_LANGS):
        subtitle, subtitle_html = fetch_subtitle(info['id'], lang, subtitle_offset)
        save_to(file_pattern % (lang, "sub"), subtitle)
        save_to(file_pattern % (lang, "html"), subtitle_html)
        info[lang] = {'subtitle': subtitle, 'subtitle_html': subtitle_html}
    comments = fetch_comments(info['forum_id'])
    info['comments'] = comments
    save_to(file_pattern % ("all", "comments.html"), comments)
    return info
def download(url, filename):
    """Download *url* to *filename* with wget (-c resumes partial files).

    SECURITY FIX: pass an argument list with the default shell=False
    instead of interpolating the URL into a shell string, so characters
    like '&', ';' or spaces in the URL or filename cannot be interpreted
    by a shell.
    """
    call(['/usr/bin/wget', '-c', '-O', filename, url])
def get_and_save(url, output_dir, offset):
print
print "Getting", url
title = slugify(url.split('/')[-1].replace('.html', ''))
file_pattern = "%s%s-%%s.%%s" % (output_dir, title)
cachefile = "%s%s.%s" % (output_dir, title, "info")
try:
video_info = load_jsonfile(cachefile)
except IOError:
video_info = fetch_page_info(url, offset, file_pattern)
save_jsonfile(cachefile, video_info)
# save_all(video_info) # TODO move all save_to into this
if DOWNLOAD_VIDEO:
download(video_info['download_url'], file_pattern % ("en", "mp4"))
def main_pipe():
urls = sys.stdin.read()
for url in [u.strip() for u in urls.split(os.linesep) if u]:
try:
if not url.startswith('#'):
get_and_save(url, OUTPUT_DIR, 0)
except:
print "Failed to fetch %s" % url
def main():
if len(sys.argv) < 2:
print "TED.com video/subtitle downloader"
print "Usage #1: %s TALK-URL [offset in seconds]" % sys.argv[0]
print "Usage #2: %s < video-list.txt" % sys.argv[0]
print "Eg. %s http://www.ted.com/talks/james_nachtwey_s_searing_pictures_of_war.html" % sys.argv[0]
print "Supply [offset] to override one specified by the page (sometimes wrong)."
sys.exit(2)
try:
offset = sys.argv[2]
except:
offset = 0
try:
get_and_save(sys.argv[1], OUTPUT_DIR, offset)
except:
raise
if __name__ == '__main__':
    # Piped input (non-tty stdin) means a URL list is being fed in;
    # otherwise expect a single URL on the command line.
    if not sys.stdin.isatty():
        main_pipe()
    else:
        main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment