Created
May 19, 2013 15:28
-
-
Save selfboot/5607996 to your computer and use it in GitHub Desktop.
TED 视频下载: 获取指定标签下的所有带英文字幕的视频的下载url,然后用`curl -L -C - -O url`下载视频。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import requests | |
from bs4 import BeautifulSoup | |
import re | |
# Captures the file name (without extension) of an ".mp4" link: the text
# between the last "/" before the extension and the literal ".mp4".
# The dot is escaped and the extension is matched exactly, so names such as
# "talk.mp3" or "talkXmp" are no longer accepted by accident.
keywords_pattern = re.compile(r".*/([^.]+)\.mp4")
# Literal markup searched for in the download-links response to decide
# whether an English option is offered.
subtitles_url_pattern = re.compile(r'<option value="en">English</option>')
# Site root and the path of the page listing every tag.
base_url = "http://www.ted.com"
tags_url = "/talks/tags"
# Prefix/suffix wrapped around a talk keyword to build the download-links URL.
mp4_pre_url = "http://www.ted.com/download/links/slug/"
mp4_post_url = "/type/talks/ext/mp4"
# Prefix/suffix wrapped around a talk keyword to build the final video URL.
media_pre_url = "http://download.ted.com/talks/"
media_post_subtitles = "-480p-en.mp4?apikey=TEDDOWNLOAD"
# Prefix of the paginated listing URL for a single tag.
tag_name_baseurl = "http://www.ted.com/talks/tags/name/"
# Filled by get_all_tags(): tag name -> talk count (kept as a string).
tags_dict = {}
def get_all_tags():
    """Fill the module-level ``tags_dict`` from the tags index page.

    Fetches ``base_url + tags_url`` and inspects every link inside the
    ``<div id="maincontent">`` element. For each link whose text contains a
    parenthesised number, stores ``tags_dict[tag_name] = count`` where
    ``count`` is the text found between the first "(" and the following ")",
    kept as a string.

    Links without an ``href`` or without a numeric "(...)" part are skipped
    instead of raising or storing an unusable count. If the container
    element is missing, ``tags_dict`` is left unchanged.

    Returns:
        None. The result is the side effect on ``tags_dict``.
    """
    all_tags_req = requests.get(base_url + tags_url)
    all_tags_soup = BeautifulSoup(all_tags_req.content)
    maincontent_tag = all_tags_soup.find('div', id="maincontent")
    if maincontent_tag is None:
        # Nothing to parse (unexpected page layout or error page).
        return
    for single_tag in maincontent_tag.find_all('a', href=True):
        # get_text() never returns None, unlike .string on a link that
        # contains nested markup.
        tag_sum_str = single_tag.get_text()
        _, opening, remainder = tag_sum_str.partition("(")
        tag_sum, closing, _ = remainder.partition(")")
        if not (opening and closing) or not tag_sum.strip().isdigit():
            continue
        # Drops the first 12 characters of the href, i.e. the length of
        # "/talks/tags/" -- presumably every href starts with that prefix;
        # verify against the live page.
        tag_name = single_tag["href"][12:]
        tags_dict[tag_name] = tag_sum
def get_tag_url_pages(tag_name, per_page=12):
    """Return the listing URL prefix and the page count for a tag.

    Args:
        tag_name: A key of ``tags_dict``.
        per_page: Number of talks assumed per listing page. Defaults to 12,
            the value that used to be hard-coded here.

    Returns:
        A ``(tag_name_url, pages_count)`` tuple. ``tag_name_url`` ends in
        "/page/" so a page number can be appended directly; ``pages_count``
        is always an ``int``.

    Raises:
        KeyError: If ``tag_name`` is not in ``tags_dict``.
        ValueError: If the stored count is not an integer string.
    """
    tag_name_url = "%s%s/page/" % (tag_name_baseurl, tag_name)
    total = int(tags_dict[tag_name])
    # Floor division keeps the result an int on Python 3 as well, so the
    # caller can pass it to range(); this is the ceiling of total / per_page.
    pages_count = (total - 1) // per_page + 1
    return tag_name_url, pages_count
#### | |
def get_media_html(tag_page_url):
    """Collect the talk page URLs linked from one tag listing page.

    Downloads ``tag_page_url``, finds every ``<dl class="clearfix">``
    element and takes the ``href`` of the first link inside each one.

    Args:
        tag_page_url: URL of a single listing page.

    Returns:
        A list of absolute URLs (``base_url`` + href), in page order.
    """
    listing_response = requests.get(tag_page_url)
    listing_soup = BeautifulSoup(listing_response.content)
    return [
        base_url + entry.find('a')["href"]
        for entry in listing_soup.find_all('dl', class_="clearfix")
    ]
def get_media_keyword(media_html):
    """Return the download keyword of a talk page, or None if unavailable.

    Fetches the talk page, looks for the link with
    ``id="no-flash-video-download"`` and extracts the keyword from its
    ``href`` with ``keywords_pattern``.

    Args:
        media_html: URL of a talk page, such as
            http://www.ted.com/talks/steve_jobs_how_to_live_before_you_die.html

    Returns:
        The keyword captured by ``keywords_pattern``, or ``None`` when the
        page has no such link, the link has no ``href``, or the ``href``
        does not match the pattern.
    """
    media_html_req = requests.get(media_html)
    media_html_tag = BeautifulSoup(media_html_req.content)
    keywords_tag = media_html_tag.find('a', id="no-flash-video-download")
    if keywords_tag is None:
        return None
    # A non-matching href used to raise AttributeError on the missing match
    # object; treat it like any other "not downloadable" page.
    match = keywords_pattern.search(keywords_tag.get("href", ""))
    if match is None:
        return None
    return match.group(1)
def get_media_url(media_html):
    """Return the video download URL for a talk page, or None.

    Resolves the talk keyword with ``get_media_keyword``, requests the
    download-links page built from ``mp4_pre_url`` / ``mp4_post_url`` and
    checks whether its body contains the English option markup matched by
    ``subtitles_url_pattern``.

    Args:
        media_html: URL of a talk page.

    Returns:
        ``media_pre_url + keyword + media_post_subtitles`` when the English
        option is present; ``None`` when the keyword cannot be determined or
        the option is absent.
    """
    keywords = get_media_keyword(media_html)
    if not keywords:
        return None
    subtitles_url = "%s%s%s" % (mp4_pre_url, keywords, mp4_post_url)
    subtitles_req = requests.get(subtitles_url)
    # Search the decoded body: on Python 3 ``content`` is bytes and cannot
    # be searched with a pattern compiled from a str.
    if subtitles_url_pattern.search(subtitles_req.text) is None:
        return None
    return "%s%s%s" % (media_pre_url, keywords, media_post_subtitles)
def get_tag_medias(pre_url, pages_count):
    """Print a "-O <url>" line for every downloadable talk of a tag.

    Walks listing pages ``pre_url + "1"`` through
    ``pre_url + str(pages_count)``, resolves each talk found there with
    ``get_media_url`` and prints the URLs that could be resolved.

    Args:
        pre_url: Listing URL prefix to which the page number is appended.
        pages_count: Number of listing pages to visit (pages are 1-based).

    Returns:
        None. Output goes to standard output.
    """
    for page_number in range(1, pages_count + 1):
        tag_page_url = pre_url + str(page_number)
        for media_html in get_media_html(tag_page_url):
            media_url = get_media_url(media_html)
            if media_url:
                # Single-argument call form: prints "-O <url>" identically
                # on Python 2 and Python 3.
                print("-O %s" % media_url)
if __name__ == "__main__":
    import sys

    get_all_tags()
    # Tags may be given as command-line arguments; with no arguments the
    # built-in list below is used.
    tags_str = "marketing web wikipedia work"
    requested_tags = sys.argv[1:] or tags_str.split()
    for tag_name in requested_tags:
        if tag_name in tags_dict:
            print(tag_name)
            pre_url, pages_count = get_tag_url_pages(tag_name)
            get_tag_medias(pre_url, pages_count)
        else:
            # The two spaces reproduce the output of the former
            # print-statement form ("no such tag: " plus a separator).
            print("no such tag:  %s" % tag_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment