handyman5/ds_downloader.py

## ds_downloader.py
#!/usr/bin/python
#
# Notes
# -----
# This script depends on rtmpdump and mythnettv; specify their paths in the global variables below.
# If you don't need mythnettv support, leave it undefined (MYTHNETTV="") and the script will skip that step
# svn update -r275 # rtmpdump
#

import re,threading
from os import stat, popen
from sys import exit
from urllib import urlopen
from optparse import OptionParser
from lxml import etree
from re import escape
import logging

GEN_URL="http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?uri=mgid:cms:episode:thedailyshow.com:%d"
MYTHNETTV="mythnettv"
RTMPDUMP="rtmpdump"
RTMPDUMP_OPTS="-W http://media.mtvnservices.com/player/release/?v=4.4.6 -q"

def get_media_id(page):
    matches = re.findall('mgid:cms:[^:]+:comedycentral.com:[0-9]+', page)
    logging.debug('get_media_id:: matches[0] = %s' % matches[0])
    media_id = int(matches[0].split(':')[-1])
    logging.info('get_media_id:: media_id = %d' % media_id)
    return media_id

def get_metadata(page):
    html = etree.HTML(page)
    head = html.find('head')
    body = html.find('body')

    title = body.xpath("//div[@class='showName']")[0].text.strip()
    subtitle = body.xpath("//h1[@class='subTitle']")[0].text.strip()
    description = head.xpath("//meta[@name='description']")[0].values()[1]
    logging.debug('get_metadata:: %s: %s: %s' % (title, subtitle, description))
    return (title, subtitle, description)

def get_episode_list():
    urls = { 'The Daily Show': "http://www.thedailyshow.com/full-episodes", 'Colbert Report': "http://www.colbertnation.com/full-episodes/" }

    episodes = []
    for (title,url) in urls.items():
        page_data = urlopen(url).read()
        anchors = etree.HTML(page_data).xpath("//span[@class='date']/a")
        for item in anchors:
            logging.debug('get_episode_list:: item.text = %s, item.href = %s' % (item.text, item.attrib['href']))
            episodes.append( ( title, item.text, item.attrib['href'] ) )

    return episodes


def get_media_files(id_num):
    output = urlopen(GEN_URL % id_num).read()
    urls = re.findall('<src>(.*\.mp4)</src>', output)
    logging.debug('get_media_files:: urls = %s' % '\n'.join(urls))
    return urls


def merge_files(id_num, filenames):
    cmd = "mencoder -really-quiet -oac mp3lame -ovc lavc -lavcopts vcodec=mpeg4:vbitrate=1800 -o output_%d.avi " % id_num
    for x in filenames: cmd += " %s" % x

    try:
        stat("output_%d.avi" % id_num)
    except OSError:
        logging.debug('merge_files:: cmd = %s' % cmd)
        popen(cmd)
    return "output_%d.avi" % id_num

########################################

class Downloader(threading.Thread):
    def __init__(self, url):
        self.rtmp = url
        self.output = ""
        threading.Thread.__init__(self)

    def download_rtmp_url(self):
        filename = self.rtmp.split("/")[-1].replace("mp4", "flv")
        cmd = RTMPDUMP + " " + RTMPDUMP_OPTS + " -o " + filename + " -r " + self.rtmp
        logging.debug('download_rtmp_url:: cmd = %s' % cmd)

        try:
            stat(filename)
        except OSError:
            popen(cmd)
        return filename

    def run (self):
        if self.rtmp.find('sixty') >= 0:
            self.output = ""
            return
        if self.rtmp.find('sting') >= 0:
            self.output = ""
            return

        logging.debug('download_rtmp_url:: output = %s' % self.rtmp)

        self.output = self.download_rtmp_url()

########################################

if __name__ == '__main__':
    usage = '''
%prog http://www.thedailyshow.com/full-episodes/date-and-guest
%prog http://www.colbertnation.com/full-episodes/date-and-title-of-the-episode
%prog http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:{ID}
%prog http://media.mtvnservices.com/mgid:cms:video:comedycentral.com:{ID}
%prog http://media.mtvnservices.com/mgid:cms:fullepisode:comedycentral.com:{ID}
'''
    parser = OptionParser(usage)
    parser.add_option("-l", "--list", action="store_true", dest="list", default=False, help="show a list of this week's episodes to select from")
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="specify log level (info, warn, debug, etc.)")

    (options, args) = parser.parse_args()
    if len(args) == 0 and not options.list:
        parser.print_help()
        exit(1)

    loglevel = getattr(logging, options.debug and "DEBUG" or "INFO".upper(), None)
    logging.basicConfig(level=loglevel)

    url = ""
    if options.list:
        episodes = get_episode_list()
        index = 0
        for episode in episodes:
            print "%d: %s - %s" % (index, episode[0], episode[1])
            index = index + 1

        selected = raw_input('Enter an episode number: ')
        url = episodes[int(selected)][2]
    else:
        url = args[0]

    page_data = urlopen(url).read()

    id_num = get_media_id(page_data)

    (title, subtitle, description) = get_metadata(page_data)

    threads = []
    for x in range(1,5):
        url = get_media_files(id_num + x)[-1]
        d = Downloader(url)
        threads.append(d)
        d.start()

#    urls = get_media_files(id_num)

#    threads = []
#    for url in urls:
#        d = Downloader(url)
#        threads.append(d)
#        d.start()

    results_new = []
    for thread in threads:
        thread.join()
        output = thread.output
        logging.debug('downloader output: %s' % thread.output)
        results_new.append(output)

    logging.debug('\n'.join(results_new))

    logging.debug('Merging files for id %d' % id_num)
    result = merge_files(id_num, results_new)

    if MYTHNETTV != "":
        cmd = MYTHNETTV + ' importlocal %s "%s" "%s" "%s"' % (result, title, subtitle, escape(description))
        logging.debug('Importing into MythTV; command = %s' % cmd)
        popen(cmd)
	#!/usr/bin/python
	#
	# Notes
	# -----
	# This script depends on rtmpdump and mythnettv; specify their paths in the global variables below.
	# If you don't need mythnettv support, leave it undefined (MYTHNETTV="") and the script will skip that step
	# svn update -r275 # rtmpdump
	#

	import re,threading
	from os import stat, popen
	from sys import exit
	from urllib import urlopen
	from optparse import OptionParser
	from lxml import etree
	from re import escape
	import logging

	GEN_URL="http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?uri=mgid:cms:episode:thedailyshow.com:%d"
	MYTHNETTV="mythnettv"
	RTMPDUMP="rtmpdump"
	RTMPDUMP_OPTS="-W http://media.mtvnservices.com/player/release/?v=4.4.6 -q"

	def get_media_id(page):
	matches = re.findall('mgid:cms:[^:]+:comedycentral.com:[0-9]+', page)
	logging.debug('get_media_id:: matches[0] = %s' % matches[0])
	media_id = int(matches[0].split(':')[-1])
	logging.info('get_media_id:: media_id = %d' % media_id)
	return media_id

	def get_metadata(page):
	html = etree.HTML(page)
	head = html.find('head')
	body = html.find('body')

	title = body.xpath("//div[@class='showName']")[0].text.strip()
	subtitle = body.xpath("//h1[@class='subTitle']")[0].text.strip()
	description = head.xpath("//meta[@name='description']")[0].values()[1]
	logging.debug('get_metadata:: %s: %s: %s' % (title, subtitle, description))
	return (title, subtitle, description)

	def get_episode_list():
	urls = { 'The Daily Show': "http://www.thedailyshow.com/full-episodes", 'Colbert Report': "http://www.colbertnation.com/full-episodes/" }

	episodes = []
	for (title,url) in urls.items():
	page_data = urlopen(url).read()
	anchors = etree.HTML(page_data).xpath("//span[@class='date']/a")
	for item in anchors:
	logging.debug('get_episode_list:: item.text = %s, item.href = %s' % (item.text, item.attrib['href']))
	episodes.append( ( title, item.text, item.attrib['href'] ) )

	return episodes


	def get_media_files(id_num):
	output = urlopen(GEN_URL % id_num).read()
	urls = re.findall('<src>(.*\.mp4)</src>', output)
	logging.debug('get_media_files:: urls = %s' % '\n'.join(urls))
	return urls


	def merge_files(id_num, filenames):
	cmd = "mencoder -really-quiet -oac mp3lame -ovc lavc -lavcopts vcodec=mpeg4:vbitrate=1800 -o output_%d.avi " % id_num
	for x in filenames: cmd += " %s" % x

	try:
	stat("output_%d.avi" % id_num)
	except OSError:
	logging.debug('merge_files:: cmd = %s' % cmd)
	popen(cmd)
	return "output_%d.avi" % id_num

	########################################

	class Downloader(threading.Thread):
	def __init__(self, url):
	self.rtmp = url
	self.output = ""
	threading.Thread.__init__(self)

	def download_rtmp_url(self):
	filename = self.rtmp.split("/")[-1].replace("mp4", "flv")
	cmd = RTMPDUMP + " " + RTMPDUMP_OPTS + " -o " + filename + " -r " + self.rtmp
	logging.debug('download_rtmp_url:: cmd = %s' % cmd)

	try:
	stat(filename)
	except OSError:
	popen(cmd)
	return filename

	def run (self):
	if self.rtmp.find('sixty') >= 0:
	self.output = ""
	return
	if self.rtmp.find('sting') >= 0:
	self.output = ""
	return

	logging.debug('download_rtmp_url:: output = %s' % self.rtmp)

	self.output = self.download_rtmp_url()

	########################################

	if __name__ == '__main__':
	usage = '''
	%prog http://www.thedailyshow.com/full-episodes/date-and-guest
	%prog http://www.colbertnation.com/full-episodes/date-and-title-of-the-episode
	%prog http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:{ID}
	%prog http://media.mtvnservices.com/mgid:cms:video:comedycentral.com:{ID}
	%prog http://media.mtvnservices.com/mgid:cms:fullepisode:comedycentral.com:{ID}
	'''
	parser = OptionParser(usage)
	parser.add_option("-l", "--list", action="store_true", dest="list", default=False, help="show a list of this week's episodes to select from")
	parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="specify log level (info, warn, debug, etc.)")

	(options, args) = parser.parse_args()
	if len(args) == 0 and not options.list:
	parser.print_help()
	exit(1)

	loglevel = getattr(logging, options.debug and "DEBUG" or "INFO".upper(), None)
	logging.basicConfig(level=loglevel)

	url = ""
	if options.list:
	episodes = get_episode_list()
	index = 0
	for episode in episodes:
	print "%d: %s - %s" % (index, episode[0], episode[1])
	index = index + 1

	selected = raw_input('Enter an episode number: ')
	url = episodes[int(selected)][2]
	else:
	url = args[0]

	page_data = urlopen(url).read()

	id_num = get_media_id(page_data)

	(title, subtitle, description) = get_metadata(page_data)

	threads = []
	for x in range(1,5):
	url = get_media_files(id_num + x)[-1]
	d = Downloader(url)
	threads.append(d)
	d.start()

	# urls = get_media_files(id_num)

	# threads = []
	# for url in urls:
	# d = Downloader(url)
	# threads.append(d)
	# d.start()

	results_new = []
	for thread in threads:
	thread.join()
	output = thread.output
	logging.debug('downloader output: %s' % thread.output)
	results_new.append(output)

	logging.debug('\n'.join(results_new))

	logging.debug('Merging files for id %d' % id_num)
	result = merge_files(id_num, results_new)

	if MYTHNETTV != "":
	cmd = MYTHNETTV + ' importlocal %s "%s" "%s" "%s"' % (result, title, subtitle, escape(description))
	logging.debug('Importing into MythTV; command = %s' % cmd)
	popen(cmd)