Skip to content

Instantly share code, notes, and snippets.

@albingeorge
Created July 24, 2014 07:12
Show Gist options
  • Save albingeorge/fd11b65baf15f9e0395d to your computer and use it in GitHub Desktop.
Save albingeorge/fd11b65baf15f9e0395d to your computer and use it in GitHub Desktop.
Download all videos from a youtube channel
import mechanize
import cookielib
from bs4 import BeautifulSoup
import re, urllib2, sys, json, os, resource
from pprint import pprint
from urlparse import urlparse, parse_qs
class Browser:
    """Thin wrapper around mechanize.Browser configured for crawling.

    Sets up a cookie jar, tolerant handler options and a custom
    User-Agent header so requests look like a regular desktop browser.
    """

    def __init__(self, user_agent):
        """Create and configure the underlying mechanize browser.

        user_agent -- User-Agent header string sent with every request.
        """
        self.browser = mechanize.Browser()
        # Cookie Jar -- keep session cookies between requests.
        cj = cookielib.LWPCookieJar()
        self.browser.set_cookiejar(cj)
        # Browser options
        self.browser.set_handle_equiv(True)
        # self.browser.set_handle_gzip(True)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        # Ignore robots.txt so channel pages can be fetched.
        self.browser.set_handle_robots(False)
        # Follows refresh 0 but not hangs on refresh > 0
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        self.browser.addheaders = [('User-agent', user_agent)]

    def get_html_from_url(self, url):
        """Fetch *url* and return the response body as a string."""
        response = self.browser.open(url)
        try:
            return response.read()
        finally:
            # Close the response explicitly so the underlying connection
            # is released instead of leaking once per fetched page.
            response.close()
# --- Collect video page URLs from the channel/playlist page given on argv ---
url = sys.argv[1]
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:29.0) Gecko/20100101 Firefox/29.0'
browser = Browser(user_agent)
html = browser.get_html_from_url(url)
soup = BeautifulSoup(html)
# Each playlist entry is rendered as a scroller list item.
playlist = soup.findAll('li', {"class": "yt-uix-scroller-scroll-unit"})
# Every anchor carrying an href inside a playlist entry is a relative
# link to a video page; prefix the host to make it absolute.
urls = ["https://www.youtube.com" + a['href']
        for listelement in playlist
        for a in listelement.find_all('a', href=True)]
# Yaay!! Got the urls
# Yaay!! Got the urls
for url in urls:
print "Resource used: " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) + "!!!"
html = browser.get_html_from_url(url)
soup = BeautifulSoup(html)
title = soup.title.string
title = re.sub(r'\W+', '_', title.title())
target_file_name = title + ".flv"
if os.path.isfile(os.getcwd() + "/" + target_file_name) and os.stat(os.getcwd() + "/" + target_file_name).st_size != 0:
print "Ignoring downloading of the file " + target_file_name
continue
test = [elem.get_text() for elem in soup.findAll(['script']) if elem.get_text().startswith('var ytplayer')]
test = test[0]
start_string = 'ytplayer.config = '
result = test[test.find(start_string) + len(start_string) : test.find(';(function() {var encoded')]
results = json.loads(result)
videos = results['args']['url_encoded_fmt_stream_map']
queries_as_list = parse_qs(videos)
with open ('testfile.html', 'w') as f: f.write ('')
with open('testfile.html', 'wt') as out:
pprint(queries_as_list, stream=out)
itags = []
for url in queries_as_list['url']:
itags.append(re.search('itag\=([0-9]+)',url).group(1))
itags = map(int, itags)
index_max_itag = itags.index(max(itags))
url = queries_as_list['url'][index_max_itag]
# Download file
tofile = open("./" + target_file_name,"w")
print "Downloading " + target_file_name
try:
r = browser.browser.open(url).read()
tofile.write(r)
except:
print "Download failed for file " + target_file_name
tofile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment