Skip to content

Instantly share code, notes, and snippets.

@armonge
Created March 15, 2012 00:16
Show Gist options
  • Save armonge/2040636 to your computer and use it in GitHub Desktop.
Save armonge/2040636 to your computer and use it in GitHub Desktop.
Get a list of all the YouTube download URLs for PyCon 2012
from bs4 import BeautifulSoup
from urlparse import parse_qs, urlparse
from urlparse import unquote, urlparse, parse_qs
from concurrent.futures import ProcessPoolExecutor
import urllib
import sys
# Module-level accumulator: load_page appends every scraped video href here,
# and the __main__ block reads it back (de-duplicated through set()).
video_list = []
# Raise the interpreter's maximum recursion depth to 10000 for this run.
sys.setrecursionlimit(10000)
def coroutine(func):
    """Decorator that primes a generator-based coroutine.

    Calling the decorated function creates the generator and immediately
    advances it to its first ``yield``, so callers can use ``send()``
    without a preliminary ``next()`` call.

    Args:
        func: A generator function.

    Returns:
        A wrapper with the same signature as ``func`` that returns the
        already-started generator object.
    """
    # Imported locally so this block does not rely on the module's import list.
    from functools import wraps

    @wraps(func)  # keep the decorated function's __name__ / __doc__
    def start(*args, **kwargs):
        cr = func(*args, **kwargs)
        # The next() builtin works on Python 2.6+ and Python 3, whereas the
        # generator method ``cr.next()`` only exists on Python 2.
        next(cr)
        return cr

    return start
def make_video_data_request(link):
    """Look up the title and stream map for a YouTube watch link.

    The video id is taken from the ``v`` query parameter of ``link`` and
    used to fetch ``get_video_info`` from youtube.com. The response body is
    decoded as UTF-8, unquoted and parsed as a query string.

    Args:
        link: URL of a video page; expected to carry a ``v`` query parameter.

    Returns:
        ``(title, stream_map)`` on success, where ``stream_map`` is the first
        ``url_encoded_fmt_stream_map`` value with its first four characters
        removed (presumably a leading ``url=`` -- confirm against a live
        response). If ``link`` has no ``v`` parameter, or the response lacks
        ``token``, ``title`` or ``url_encoded_fmt_stream_map``, returns
        ``('', link)`` instead.
    """
    # Imported locally so this block does not rely on the module's import list.
    from contextlib import closing

    try:
        parsed_q = parse_qs(urlparse(link).query)
        video_id = parsed_q['v'][0]
        info_url = 'http://www.youtube.com/get_video_info?video_id=%s' % video_id
        # closing() guarantees the HTTP response is released even if read()
        # or decode() raises; the Python 2 response object is not itself a
        # context manager.
        with closing(urllib.urlopen(info_url)) as response:
            raw_info = response.read().decode('utf-8')
        video_data = parse_qs(unquote(raw_info))
        # The token value itself is never used, but a response without one
        # is treated as a failed lookup, so keep that check explicit.
        if 'token' not in video_data:
            return '', link
        title = video_data['title'][0]
        return title, video_data['url_encoded_fmt_stream_map'][0][4:]
    except KeyError:
        # Missing query parameter or response field: fall back to the link.
        return '', link
@coroutine
def load_page(base, already_loaded):
    """Coroutine that fetches listing pages and collects video links.

    Each value passed to ``send()`` is a path that is appended to ``base``.
    If the resulting URL has not been fetched yet, the page is downloaded,
    the ``href`` of every ``#coverage .coverage-video h3 a`` element is
    appended to the module-level ``video_list``, and ``send()`` returns the
    page's ``.pagination a`` elements. For a URL that was already fetched,
    ``send()`` returns ``None``.

    Args:
        base: URL prefix prepended to every path that is sent in.
        already_loaded: List of URLs fetched so far; mutated in place.
    """
    # Imported locally so this block does not rely on the module's import list.
    from contextlib import closing

    pagination = None
    while True:
        # One yield both hands back the previous request's result and
        # receives the next path. With separate "receive" and "result"
        # yields, the send() following a result resumed at the result yield
        # and its path was thrown away, so every other page was skipped.
        path = yield pagination
        pagination = None
        url = base + path
        if url in already_loaded:
            continue
        with closing(urllib.urlopen(url)) as filehandle:
            markup = filehandle.read()
        already_loaded.append(url)
        soup = BeautifulSoup(markup)
        for a in soup.select('#coverage .coverage-video h3 a'):
            video_list.append(a['href'])
        pagination = soup.select('.pagination a')
def cycle_gen(pager, url):
    """Drive ``pager`` over ``url`` and every page reachable from it.

    Each path is printed and passed to ``pager.send()``. When the pager
    returns a non-empty result, the ``href`` of each item in it is visited
    next, depth-first and in the order the pager returned them.

    Args:
        pager: Object with a ``send(path)`` method returning either a falsy
            value or an iterable of items indexable by ``'href'``.
        url: Path to start from.
    """
    # An explicit stack keeps the original pre-order, depth-first sequence
    # of send() calls without growing the interpreter call stack, so a long
    # chain of pages cannot hit the recursion limit.
    pending = [url]
    while pending:
        current = pending.pop()
        # print(x) with one argument behaves the same on Python 2 and 3.
        print(current)
        result = pager.send(current)
        if result:
            # Reversed so the first link ends up on top of the stack.
            pending.extend(reversed([p['href'] for p in result]))
if __name__ == '__main__':
    # Script entry point: crawl the PyCon 2012 video listing on lanyrd.com,
    # resolve every collected link through make_video_data_request, and
    # append one wget command per video to video_list.txt.

    # Imported locally so this block does not rely on the module's import list.
    import io

    pager = load_page('http://lanyrd.com', [])
    cycle_gen(pager, '/2012/pycon/video/')

    # io.open with an explicit encoding accepts the unicode command line on
    # both Python 2 and 3; the builtin Python 2 open() would raise
    # UnicodeEncodeError on the first title containing non-ASCII characters.
    with io.open('video_list.txt', 'a', encoding='utf-8') as video_list_file:
        with ProcessPoolExecutor() as e:
            # set() removes duplicate links before fanning out the requests.
            for title, url in e.map(make_video_data_request, set(video_list)):
                # NOTE(review): url and title are interpolated unescaped; a
                # double quote in a title would break the generated command.
                video_list_file.write(u'wget "%s" -O "%s"\n' % (url, title))
@armonge
Copy link
Author

armonge commented Mar 15, 2012

By the way, you need to have BeautifulSoup4 and Futures installed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment