Skip to content

Instantly share code, notes, and snippets.

@albingeorge
Created July 24, 2014 07:12
Show Gist options
  • Save albingeorge/fd11b65baf15f9e0395d to your computer and use it in GitHub Desktop.
Save albingeorge/fd11b65baf15f9e0395d to your computer and use it in GitHub Desktop.
Download all videos from a youtube channel
import mechanize
import cookielib
from bs4 import BeautifulSoup
import re, urllib2, sys, json, os, resource
from pprint import pprint
from urlparse import urlparse, parse_qs
class Browser:
    """Thin wrapper around mechanize.Browser configured for crawling.

    Sets up a cookie jar, tolerant handler options and a custom
    User-Agent header so requests look like a regular desktop browser.
    """

    def __init__(self, user_agent):
        """Create and configure the underlying mechanize browser.

        user_agent -- User-Agent header string sent with every request.
        """
        self.browser = mechanize.Browser()
        # Cookie Jar -- keep session cookies between requests.
        cj = cookielib.LWPCookieJar()
        self.browser.set_cookiejar(cj)
        # Browser options
        self.browser.set_handle_equiv(True)
        # self.browser.set_handle_gzip(True)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        # Ignore robots.txt so channel pages can be fetched.
        self.browser.set_handle_robots(False)
        # Follows refresh 0 but not hangs on refresh > 0
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        self.browser.addheaders = [('User-agent', user_agent)]

    def get_html_from_url(self, url):
        """Fetch *url* and return the response body as a string."""
        response = self.browser.open(url)
        try:
            return response.read()
        finally:
            # Close the response explicitly so the underlying connection
            # is released instead of leaking once per fetched page.
            response.close()
# --- Collect video page URLs from the channel/playlist page given on argv ---
url = sys.argv[1]
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:29.0) Gecko/20100101 Firefox/29.0'
browser = Browser(user_agent)
html = browser.get_html_from_url(url)
soup = BeautifulSoup(html)
# Each playlist entry is rendered as a scroller list item.
playlist = soup.findAll('li', {"class": "yt-uix-scroller-scroll-unit"})
# Every anchor carrying an href inside a playlist entry is a relative
# link to a video page; prefix the host to make it absolute.
urls = ["https://www.youtube.com" + a['href']
        for listelement in playlist
        for a in listelement.find_all('a', href=True)]
# Yaay!! Got the urls
# Yaay!! Got the urls
for url in urls:
print "Resource used: " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) + "!!!"
html = browser.get_html_from_url(url)
soup = BeautifulSoup(html)
title = soup.title.string
title = re.sub(r'\W+', '_', title.title())
target_file_name = title + ".flv"
if os.path.isfile(os.getcwd() + "/" + target_file_name) and os.stat(os.getcwd() + "/" + target_file_name).st_size != 0:
print "Ignoring downloading of the file " + target_file_name
continue
test = [elem.get_text() for elem in soup.findAll(['script']) if elem.get_text().startswith('var ytplayer')]
test = test[0]
start_string = 'ytplayer.config = '
result = test[test.find(start_string) + len(start_string) : test.find(';(function() {var encoded')]
results = json.loads(result)
videos = results['args']['url_encoded_fmt_stream_map']
queries_as_list = parse_qs(videos)
with open ('testfile.html', 'w') as f: f.write ('')
with open('testfile.html', 'wt') as out:
pprint(queries_as_list, stream=out)
itags = []
for url in queries_as_list['url']:
itags.append(re.search('itag\=([0-9]+)',url).group(1))
itags = map(int, itags)
index_max_itag = itags.index(max(itags))
url = queries_as_list['url'][index_max_itag]
# Download file
tofile = open("./" + target_file_name,"w")
print "Downloading " + target_file_name
try:
r = browser.browser.open(url).read()
tofile.write(r)
except:
print "Download failed for file " + target_file_name
tofile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment