Last active
December 29, 2015 18:59
-
-
Save pudquick/7714861 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, requests, HTMLParser, os, pipes | |
from xml.dom.minidom import parseString | |
# NOTE(review): `global` at module level is a no-op in Python; the functions
# below each declare `global all_series` themselves before assigning it.
global all_series
def process_show_list(show_list):
    """Parse one page of MTV's "all shows" listing HTML.

    Args:
        show_list: raw HTML of a listing page (string).

    Returns:
        A sorted tuple of unique (show_name, show_url, kind) tuples.
        kind is 'series' when the URL ends in 'series.jhtml', else ''.
        show_url is '' when the entry has no link at all.
    """
    split_boundary = '<li class="group">'
    show_name = r'meta content="([^"]+)" itemprop="name"'
    show_url = r'<a itemprop="" href="([^"]+)">'
    found_shows = set()
    # Collapse newlines so the regexes match across line breaks, then split
    # into one chunk per show; [1:] drops the preamble before the first <li>.
    meta_data = show_list.replace('\n', ' ').split(split_boundary)[1:]
    for an_ep in meta_data:
        s_name = re.search(show_name, an_ep).group(1)
        try:
            s_url = re.search(show_url, an_ep).group(1)
        except AttributeError:
            # A show is allowed to lack a URL: re.search returned None,
            # so .group raised AttributeError. Catch only that.
            s_url = ''
        # Tag real series pages so callers can filter out one-off specials.
        s_series = 'series' if s_url.lower().endswith('series.jhtml') else ''
        found_shows.add((s_name, s_url, s_series))
    return tuple(sorted(found_shows))
def get_all_series():
    """Populate the module-level all_series dict by crawling the paged
    MTV listing until a page's content repeats (or a sanity cap of 20
    pages is reached). Maps show name -> show URL for 'series' entries.
    """
    global all_series
    all_series = dict()
    source = 'http://www.mtv.com/ontv/all/?page=%s'
    start = '<ol class="lst ">'
    end = '</ol>'
    sanity_max = 20
    show_pages = set()
    page_number = 0
    while page_number < sanity_max:
        page_number += 1
        response = requests.get(source % page_number)
        # Keep only the HTML between the listing's <ol> markers.
        listing = response.content.split(start, 1)[-1].split(end, 1)[0]
        page_shows = process_show_list(listing)
        if page_shows in show_pages:
            # Past the last page the site repeats itself; stop crawling.
            break
        show_pages.add(page_shows)
    # Flatten the per-page tuples into one deduplicated, sorted listing.
    every_show = sorted({show for page in show_pages for show in page})
    for name, url, kind in every_show:
        if kind == "series":
            all_series[name] = url
def friendly_name(e_name, e_s, e_id):
    """Build a display title for an episode.

    Prefers 'ID - Name' when an episode id is known, falls back to
    'Season N - Name' when only a season number is known, and otherwise
    returns the bare episode name.
    """
    if e_id:
        label = '%s - %s' % (e_id, e_name)
    elif e_s:
        label = 'Season %s - %s' % (e_s, e_name)
    else:
        label = e_name
    return label
def get_full_episodes(series_name):
    """Scrape the video page of a series for its full episodes.

    Args:
        series_name: key into the module-level all_series dict
            (populated beforehand by get_all_series()).

    Returns:
        A list of (video_uri, friendly_title, description) tuples, one
        per listing entry whose type marker reads "full episode".
    """
    global all_series
    base = 'http://www.mtv.com%s'
    # Markers bounding the episode listing inside the page HTML.
    start = '<div id="videoModuleListing" class="mdl last">'
    end = '<div class="mdl">'
    divider = '<li id="vidlist_'
    # Regexes for the per-episode metadata fields.
    show_id = r'mainuri="([^"]+)"'
    ep_type = r'<li class="list-ct">([^<]+)</li>'
    ep_name = r'itemprop="name" content="([^"]+)"'
    ep_numb = r'maintitle="([^"]+)"'
    ep_alt = r'<li class="list-ep">([^<]+)</li>'
    ep_desc = r'<meta[^>]+?itemprop="description"[^>]+?content="([^"]+)"'
    ep_sear = r'\(Se[^) ]+s[^) ]+ ([0-9]+)\)'  # e.g. "(Season 3)"
    ep_ep = r'Ep[^ 0-9]+ ([0-9]+)'             # e.g. "Episode 12"
    # The series page URL doubles as the video listing URL after a rename.
    episode_url = (base % all_series[series_name]).replace('series.jhtml', 'video.jhtml')
    episode_list = requests.get(episode_url).content.split(start,1)[-1].split(end,1)[0]
    episode_list = episode_list.replace('\n',' ').split(divider)[1:]
    full_eps = []
    # To decode HTML entities (&amp; etc.) in scraped names/descriptions.
    pars = HTMLParser.HTMLParser()
    for an_ep in episode_list:
        try:
            e_type = re.search(ep_type, an_ep).group(1)
        except AttributeError:
            # No type marker at all: treat as not a full episode.
            e_type = ''
        if e_type.lower().strip() != "full episode":
            continue
        e_id = re.search(show_id, an_ep).group(1)
        e_desc = pars.unescape(re.search(ep_desc, an_ep).group(1))
        e_name = pars.unescape(re.search(ep_name, an_ep).group(1))
        e_numb = pars.unescape(re.search(ep_numb, an_ep).group(1))
        try:
            # Season number lives left of the first '|' in the main title.
            ep_s = re.search(ep_sear, e_numb.split('|',1)[0]).group(1)
        except AttributeError:
            ep_s = ''
        try:
            # Episode number lives in the segment after the first '|'.
            ep_e = re.search(ep_ep, e_numb.split('|',1)[-1].split('|',1)[0]).group(1)
        except AttributeError:
            ep_e = ''
        if not ep_e:
            # Only trust the episode information if it wasn't in the show title
            try:
                ep_e = ''.join([x for x in re.search(ep_alt, an_ep).group(1) if x in '0123456789'])
            except AttributeError:
                ep_e = ''
        if ep_e:
            # There's at least specific episode information; guess a show
            # number: the last two digits are the episode, any leading
            # digits the season (e.g. '304' -> '3x04').
            f_ep = ('%02d' % int(ep_e))[-2:]
            f_se = ('%02d' % int(ep_e))[:-2]
            if f_se:
                f_id = f_se + 'x' + f_ep
            elif ep_s:
                f_id = ep_s + 'x' + f_ep
            else:
                f_id = f_ep
        else:
            f_id = ''
        a_name = friendly_name(e_name, ep_s, f_id)
        full_eps.append((e_id, a_name, e_desc))
    return full_eps
def get_resolutions(video_uri):
    """List the (bitrate, 'WIDTHxHEIGHT') renditions available for a video.

    Follows the embed RSS feed to the video's first clip, then asks the
    mediaGen endpoint for that clip's rendition list.
    """
    # www.mtv.com/player/embed/AS3/rss/?uri=mgid:uma:videolist:mtv.com:1611381&ref={ref} HTTP/1.1
    # www.mtv.com/player/includes/mediaGen.jhtml?uri=mgid:uma:video:mtv.com:381464&id=1611381&vid=381464&ref={ref}
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    guid_node = parseString(feed.content).getElementsByTagName('guid')[0]
    clip_uri = guid_node.childNodes[0].wholeText
    clip_vid = clip_uri.rsplit(':', 1)[-1]
    media_xml = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
    rendition_nodes = parseString(media_xml.content).getElementsByTagName('rendition')
    return tuple((node.getAttribute('bitrate'),
                  node.getAttribute('width') + 'x' + node.getAttribute('height'))
                 for node in rendition_nodes)
def get_episode_urls(video_uri, bitrate):
    """Resolve every clip of a video into a direct download URL at the
    requested bitrate, in feed order (one URL per clip)."""
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    feed_dom = parseString(feed.content)
    results = []
    for guid_node in feed_dom.getElementsByTagName('guid'):
        clip_uri = guid_node.childNodes[0].wholeText
        clip_vid = clip_uri.rsplit(':', 1)[-1]
        media_xml = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
        media_dom = parseString(media_xml.content)
        # First rendition whose bitrate matches the one requested.
        matching = [node for node in media_dom.getElementsByTagName('rendition')
                    if node.getAttribute('bitrate') == bitrate]
        rendition = matching[0]
        src_text = rendition.getElementsByTagName('src')[0].childNodes[0].wholeText
        # Keep everything after '/gsp.' and rebuild against the CDN host.
        media_end = src_text.split('/gsp.', 1)[-1]
        results.append('http://mtvnmobile2.rd.llnwd.net/44620/mtvnorigin/gsp.' + media_end)
    return results
def gen_downloads(episode_tuple, bitrate):
    """Download every part of an episode via a local axel binary.

    episode_tuple is (video_uri, title, description) as produced by
    get_full_episodes(); each part's redirect target is resolved first,
    then handed to axel for multi-connection download.
    """
    video_uri, title, desc = episode_tuple
    part_urls = get_episode_urls(video_uri, bitrate)
    for index, part_url in enumerate(part_urls):
        # The CDN URL answers with a redirect; capture the real location
        # without following it.
        probe = requests.get(part_url, allow_redirects=False)
        real_url = probe.headers['location']
        # Replace filesystem/shell-hostile characters before quoting.
        safe_title = ''.join([x if (x in '-_.()\'| abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') else "_" for x in title])
        filename = pipes.quote('%s - %s.mp4' % (safe_title, index + 1))
        # /Users/mike/bin/axel -n 6 -u -o 'filename.mp4' 'url'
        _ = os.system('/Users/mike/bin/axel -n 6 -u -o %s %s' % (filename, pipes.quote(real_url)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment