Skip to content

Instantly share code, notes, and snippets.

@selfboot
Created May 19, 2013 15:28
Show Gist options
  • Save selfboot/5607996 to your computer and use it in GitHub Desktop.
Save selfboot/5607996 to your computer and use it in GitHub Desktop.
TED 视频下载: 获取指定标签下的所有带英文字幕的视频的下载url,然后用`curl -L -C - -O url`下载视频。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
# Captures a talk's file-name stem from a direct download link: the text after
# the last "/" up to the literal ".mp4" extension.  The dot is escaped and the
# extension is matched exactly, so look-alikes such as "fooxmp" are rejected.
keywords_pattern = re.compile(r".*/([^.]+)\.mp4")
# Marker searched for in a talk's download-links page; its presence is taken
# to mean that an English-subtitled download is offered.
subtitles_url_pattern = re.compile(r'<option value="en">English</option>')

base_url = "http://www.ted.com"
tags_url = "/talks/tags"

# Download-links page of a talk: mp4_pre_url + <stem> + mp4_post_url.
mp4_pre_url = "http://www.ted.com/download/links/slug/"
mp4_post_url = "/type/talks/ext/mp4"

# Final video URL: media_pre_url + <stem> + media_post_subtitles.
media_pre_url = "http://download.ted.com/talks/"
media_post_subtitles = "-480p-en.mp4?apikey=TEDDOWNLOAD"

# Listing pages of one tag: tag_name_baseurl + <tag> + "/page/" + <page number>.
tag_name_baseurl = "http://www.ted.com/talks/tags/name/"

# Maps tag name -> number of talks (stored as a string); filled by
# get_all_tags() and read by get_tag_url_pages().
tags_dict = {}
def get_all_tags():
    """Fill the module-level ``tags_dict`` from the TED tag index page.

    Downloads ``base_url + tags_url`` and, for every link inside the
    ``<div id="maincontent">`` element whose text carries a parenthesised
    number, stores ``tag name -> count`` (the count is kept as a string of
    digits).  Links without an ``href``, without plain text, or without a
    ``(N)`` count are skipped instead of raising or storing garbage.  If the
    container element is missing, ``tags_dict`` is left untouched.

    Returns:
        None.  The result is the side effect on ``tags_dict``.
    """
    all_tags_req = requests.get(base_url + tags_url)
    all_tags_soup = BeautifulSoup(all_tags_req.content)
    maincontent_tag = all_tags_soup.find('div', id="maincontent")
    if maincontent_tag is None:
        # Unexpected page layout (or an error page): nothing to index.
        return

    count_pattern = re.compile(r"\((\d+)\)")
    # href=True keeps only anchors that actually have an href attribute.
    for single_tag in maincontent_tag.find_all('a', href=True):
        tag_sum_str = single_tag.string
        if tag_sum_str is None:
            # .string is None when the anchor wraps nested markup.
            continue
        count_match = count_pattern.search(tag_sum_str)
        if count_match is None:
            continue
        # Drop the first 12 characters of the href to get the tag name.
        # NOTE(review): presumably a fixed path prefix such as "/talks/tags/"
        # -- confirm against the live markup.
        tag_name = single_tag["href"][12:]
        tags_dict[tag_name] = count_match.group(1)
def get_tag_url_pages(tag_name, per_page=12):
    """Return ``(page_url_prefix, pages_count)`` for a tag.

    Args:
        tag_name: a key of ``tags_dict``.
        per_page: number of talks shown on one listing page (default 12).

    Returns:
        A tuple of the URL prefix to which a 1-based page number is appended,
        and the number of listing pages as an ``int``.

    Raises:
        KeyError: ``tag_name`` is not in ``tags_dict``.
        ValueError: the stored count is not a valid integer.
    """
    tag_name_url = "%s%s/page/" % (tag_name_baseurl, tag_name)
    talk_total = int(tags_dict[tag_name])
    # Ceiling division.  ``//`` keeps the result an int on Python 2 and 3;
    # plain ``/`` would produce a float on Python 3 and break range() later.
    pages_count = (talk_total - 1) // per_page + 1
    return tag_name_url, pages_count
####
def get_media_html(tag_page_url):
    """Return the absolute talk-page URLs linked from one tag listing page.

    The page at ``tag_page_url`` is fetched and, for every
    ``<dl class="clearfix">`` element, the ``href`` of its first ``<a>`` is
    appended to ``base_url``.  The URLs are returned as a list in document
    order.
    """
    listing_page = BeautifulSoup(requests.get(tag_page_url).content)
    return [
        base_url + entry.find('a')["href"]
        for entry in listing_page.find_all('dl', class_="clearfix")
    ]
def get_media_keyword(media_html):
    """Return the download file-name stem of a talk, or None.

    ``media_html`` is the URL of a talk page, such as:
    http://www.ted.com/talks/steve_jobs_how_to_live_before_you_die.html

    The page is fetched and its ``<a id="no-flash-video-download">`` link is
    looked up; the stem is whatever ``keywords_pattern`` captures from that
    link's ``href``.

    Returns:
        The captured stem, or None when the page has no such link, the link
        has no ``href``, or the ``href`` does not match ``keywords_pattern``.
    """
    media_html_req = requests.get(media_html)
    media_html_tag = BeautifulSoup(media_html_req.content)
    keywords_tag = media_html_tag.find('a', id="no-flash-video-download")
    if keywords_tag is None:
        return None

    # .get() avoids a KeyError when the anchor carries no href attribute.
    keywords_match = keywords_pattern.search(keywords_tag.get("href", ""))
    if keywords_match is None:
        # Not an .mp4 link of the expected shape: treat as "not downloadable"
        # rather than failing with AttributeError on None.
        return None
    return keywords_match.group(1)
def get_media_url(media_html):
    """Return the download URL of a talk's English-subtitled video, or None.

    Args:
        media_html: URL of the talk page.

    Returns:
        ``media_pre_url + stem + media_post_subtitles`` when the talk has a
        download stem (see ``get_media_keyword``) and its download-links page
        contains the English option matched by ``subtitles_url_pattern``;
        otherwise None.
    """
    keywords = get_media_keyword(media_html)
    if not keywords:
        return None

    subtitles_url = "%s%s%s" % (mp4_pre_url, keywords, mp4_post_url)
    subtitles_req = requests.get(subtitles_url)
    # Search the decoded body (.text), not the raw bytes (.content): applying
    # a str pattern to bytes raises TypeError on Python 3.
    if subtitles_url_pattern.search(subtitles_req.text) is None:
        return None
    return "%s%s%s" % (media_pre_url, keywords, media_post_subtitles)
def get_tag_medias(pre_url, pages_count):
    """Print a ``-O <url>`` line for every downloadable talk of a tag.

    Args:
        pre_url: listing URL prefix; the 1-based page number is appended.
        pages_count: number of listing pages to walk (pages 1..pages_count).

    Each listing page is fetched, every talk on it is resolved with
    ``get_media_url``, and talks that resolve to None are skipped.  Nothing
    is returned; the output goes to stdout.
    """
    for page_number in range(1, pages_count + 1):
        tag_page_url = pre_url + str(page_number)
        for media_html in get_media_html(tag_page_url):
            media_url = get_media_url(media_html)
            if media_url:
                # Function-call form with explicit formatting prints the same
                # "-O <url>" line on Python 2 and Python 3.
                print("%s %s" % ("-O", media_url))
if __name__ == "__main__":
    # Build the tag -> count table first; everything below reads tags_dict.
    get_all_tags()
    tags_str = "marketing web wikipedia work"
    # NOTE: to ask the user for the tags instead, read them from stdin here
    # (raw_input("input tags:") on Python 2, input("input tags:") on Python 3).
    for tag_name in tags_str.split():
        if tag_name not in tags_dict:
            # Explicit formatting reproduces the original output (including
            # its double space) on both Python 2 and Python 3.
            print("%s %s" % ("no such tag: ", tag_name))
            continue
        print(tag_name)
        pre_url, pages_count = get_tag_url_pages(tag_name)
        get_tag_medias(pre_url, pages_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment