Skip to content

Instantly share code, notes, and snippets.

@apit
Created September 6, 2011 05:28
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apit/1196670 to your computer and use it in GitHub Desktop.
TED video/subtitle/comments downloader
#! /usr/bin/python -u
"""
@author apit - http://fizdoonk.multiply.com/
@require python-lxml, python-simplejson
"""
import os, sys, re, codecs
import simplejson
import urllib2
import lxml.html
from lxml.html.clean import Cleaner
from urllib import urlopen
from subprocess import call
# Set to False to fetch only subtitles/comments and skip the .mp4 itself.
DOWNLOAD_VIDEO = True
OUTPUT_DIR = './out/'
# True -> fetch the smaller "-light" encode of the talk video.
VIDEO_LOWRES = True
BASE_URL = "http://www.ted.com"
# %s placeholders: talk id, language code.
SUBTITLE_URL = BASE_URL + "/talks/subtitles/id/%s/lang/%s"
SUBTITLE_HTML = BASE_URL + "/talks/subtitles/id/%s/lang/%s/format/html"
# %s placeholder: forum id; limit=999999 effectively requests every comment.
COMMENTS_HTML = BASE_URL + "/comments/viewRPC?forum=%s&commentsLoaded=0&limit=999999&sortby=replies"
# Transcript languages to save when a talk offers them (TED 3-letter codes).
DOWNLOADABLE_LANGS = set(['ind', 'eng'])
# Honour an http_proxy environment variable for all urllib2 requests.
if 'http_proxy' in os.environ:
    proxy = urllib2.ProxyHandler({'http': os.environ['http_proxy']})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
# from django.template.defaultfilters
def slugify(value):
    """Normalize *value* into a filename-safe slug.

    Strips characters that are not alphanumerics, underscores, whitespace
    or hyphens, lowercases the result, and collapses runs of hyphens and
    whitespace into single underscores.
    """
    # Raw strings keep backslash sequences (\w, \s) out of the hands of the
    # ordinary string-literal parser.
    value = unicode(re.sub(r'[^\w\s-]', '', value).strip().lower())
    return re.sub(r'[-\s]+', '_', value)
def save_to(filename, content):
    """Write *content* to *filename* as UTF-8 encoded text.

    Uses a ``with`` block so the handle is closed even when the write
    raises (the original leaked the handle on error).
    """
    with codecs.open(filename, "w", "utf-8") as f:
        f.write(content)
def save_jsonfile(filename, json):
    """Serialize *json* to *filename* as a UTF-8 JSON document."""
    serialized = simplejson.dumps(json)
    handle = codecs.open(filename, "w", "utf-8")
    handle.write(serialized)
    handle.close()
def load_jsonfile(filename):
    """Read *filename* (UTF-8 JSON) and return the decoded object.

    Raises IOError when the file is missing — callers rely on that to
    detect a cold cache.  The handle is now closed explicitly; the
    original returned without closing it, leaking the descriptor.
    """
    f = codecs.open(filename, 'r', "utf-8")
    try:
        return simplejson.loads(f.read())
    finally:
        f.close()
def captions_to_sub(captions, offset):
    """Convert TED caption dicts into SubRip (SRT) text.

    Each caption is a dict with keys ``content``, ``startTime`` and
    ``duration`` (times in milliseconds); ``startOfParagraph`` is unused.
    *offset* (int or numeric string) is added to every start time.

    Fixes over the original: the caller's caption dicts are no longer
    mutated, the builtin ``min`` is no longer shadowed, and the output is
    assembled with ``join`` instead of quadratic string ``+=``.
    """
    offset = int(offset)
    entries = []
    for index, cap in enumerate(captions, 1):
        start = cap['startTime'] + offset
        end = start + cap['duration']
        entries.append("%d%s%s --> %s%s%s%s%s" % (
            index, os.linesep,
            _ms_to_timestamp(start), _ms_to_timestamp(end),
            os.linesep, cap['content'], os.linesep, os.linesep))
    return ''.join(entries)

def _ms_to_timestamp(ms):
    """Format a millisecond count as an SRT timestamp.

    Talks run under one hour, hence the fixed '00' hour field (matching
    the original's comment and behavior).
    """
    minutes, ms = divmod(ms, 60000)
    seconds, ms = divmod(ms, 1000)
    return "00:%02d:%02d,%03d" % (minutes, seconds, ms)
def fetch_subtitle(id, lang, offset):
print " > fetching subtitle (%s)... " % lang.upper()
subtitle_url = SUBTITLE_URL % (id, lang)
subtitle_html = SUBTITLE_HTML % (id, lang)
f = urllib2.urlopen(SUBTITLE_URL % (id, lang))
json = unicode(f.read(), "utf-8")
subtitle = captions_to_sub(simplejson.loads(json)['captions'], offset)
f = urllib2.urlopen(subtitle_html)
cleaner = Cleaner(remove_tags=['a'])
subtitle_html = unicode(cleaner.clean_html(f.read()), "utf-8")
return [subtitle, subtitle_html]
def fetch_comments(id):
print " > fetching comments..."
wrapper = """
<html>
<body>
<link rel="stylesheet" type="text/css" media="screen" href="http://www.ted.com/css/comments.css" />
<div id="conversations">
<div id="discussion" class="silverBorderPanel">
<div class="clearfix"></div>
<h3>Comment on this Talk</h3>
<div class="clearfix">&nbsp;</div>
<div class="comment-container">%s</div>
</div>
</div></body></html>
"""
try:
f = urllib2.urlopen(COMMENTS_HTML % id)
return wrapper % unicode(f.read(), "utf-8")
except:
return ''
def parse_info(html):
"""
Parse requested page and look for title, talk id, and languages transcripted.
"""
filepattern = r'/talks/([_\w]+\-light\.mp4)' if VIDEO_LOWRES \
else r'/talks/([_\w]+\.mp4)'
match = re.search(filepattern, html)
if not match:
print "No video link"
sys.exit(-1)
download_url = "http://download.ted.com/talks/%s" % match.group(1)
title = re.search(r'altheadline\ \=\ \'([^\']+)\'', html).group(1)
offset = re.search(r'introDuration:(\d+)', html).group(1)
id = re.search(r'ti:\"(\d+)\"', html).group(1)
forum_id = re.search(r'forum:\ (\d+)', html).group(1)
doc = lxml.html.fromstring(html)
select = doc.cssselect("#languageCode")[0]
langs = select.value_options
return {'id':id,
'forum_id': forum_id,
'title': title,
'offset': offset,
'langs': langs,
'download_url': download_url
}
def fetch_page_info(url, offset, file_pattern):
    """Download the talk page, its subtitles (per DOWNLOADABLE_LANGS) and
    comments, save them through *file_pattern*, and return the info dict.

    *offset* is in seconds and, when non-zero, overrides the page's own
    introDuration.  BUG FIX: the original computed ``offset * 1000`` and
    then never used it, so the CLI offset override had no effect — and a
    string offset from argv would have been repeated 1000 times rather
    than multiplied.
    """
    override_ms = int(offset) * 1000  # seconds -> milliseconds
    f = urllib2.urlopen(url)
    html = f.read()
    info = parse_info(html)
    # Prefer the explicit override; fall back to the page-supplied offset.
    subtitle_offset = override_ms if override_ms else info['offset']
    for lang in set(info['langs']).intersection(DOWNLOADABLE_LANGS):
        subtitle, subtitle_html = fetch_subtitle(info['id'], lang, subtitle_offset)
        save_to(file_pattern % (lang, "sub"), subtitle)
        save_to(file_pattern % (lang, "html"), subtitle_html)
        info[lang] = {'subtitle': subtitle, 'subtitle_html': subtitle_html}
    comments = fetch_comments(info['forum_id'])
    info['comments'] = comments
    save_to(file_pattern % ("all", "comments.html"), comments)
    return info
def download(url, filename):
    """Download *url* to *filename* with wget (-c resumes partial files).

    SECURITY FIX: pass an argument list with the default shell=False
    instead of interpolating the URL into a shell string, so characters
    like '&', ';' or spaces in the URL or filename cannot be interpreted
    by a shell.
    """
    call(['/usr/bin/wget', '-c', '-O', filename, url])
def get_and_save(url, output_dir, offset):
print
print "Getting", url
title = slugify(url.split('/')[-1].replace('.html', ''))
file_pattern = "%s%s-%%s.%%s" % (output_dir, title)
cachefile = "%s%s.%s" % (output_dir, title, "info")
try:
video_info = load_jsonfile(cachefile)
except IOError:
video_info = fetch_page_info(url, offset, file_pattern)
save_jsonfile(cachefile, video_info)
# save_all(video_info) # TODO move all save_to into this
if DOWNLOAD_VIDEO:
download(video_info['download_url'], file_pattern % ("en", "mp4"))
def main_pipe():
urls = sys.stdin.read()
for url in [u.strip() for u in urls.split(os.linesep) if u]:
try:
if not url.startswith('#'):
get_and_save(url, OUTPUT_DIR, 0)
except:
print "Failed to fetch %s" % url
def main():
if len(sys.argv) < 2:
print "TED.com video/subtitle downloader"
print "Usage #1: %s TALK-URL [offset in seconds]" % sys.argv[0]
print "Usage #2: %s < video-list.txt" % sys.argv[0]
print "Eg. %s http://www.ted.com/talks/james_nachtwey_s_searing_pictures_of_war.html" % sys.argv[0]
print "Supply [offset] to override one specified by the page (sometimes wrong)."
sys.exit(2)
try:
offset = sys.argv[2]
except:
offset = 0
try:
get_and_save(sys.argv[1], OUTPUT_DIR, offset)
except:
raise
if __name__ == '__main__':
    # Piped input (non-tty stdin) means a URL list is being fed in;
    # otherwise expect a single URL on the command line.
    if not sys.stdin.isatty():
        main_pipe()
    else:
        main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment