Last active
December 29, 2015 18:59
-
-
Save pudquick/7714861 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, requests, HTMLParser, os, pipes | |
from xml.dom.minidom import parseString | |
# NOTE(review): `global` at module level is a no-op in Python; the functions
# below each declare `global all_series` themselves before assigning it.
global all_series
def process_show_list(show_list):
    """Parse one page of MTV's "all shows" listing HTML.

    Args:
        show_list: raw HTML of a listing page (string).

    Returns:
        A sorted tuple of unique (show_name, show_url, kind) tuples.
        kind is 'series' when the URL ends in 'series.jhtml', else ''.
        show_url is '' when the entry has no link at all.
    """
    split_boundary = '<li class="group">'
    show_name = r'meta content="([^"]+)" itemprop="name"'
    show_url = r'<a itemprop="" href="([^"]+)">'
    found_shows = set()
    # Collapse newlines so the regexes match across line breaks, then split
    # into one chunk per show; [1:] drops the preamble before the first <li>.
    meta_data = show_list.replace('\n', ' ').split(split_boundary)[1:]
    for an_ep in meta_data:
        s_name = re.search(show_name, an_ep).group(1)
        try:
            s_url = re.search(show_url, an_ep).group(1)
        except AttributeError:
            # A show is allowed to lack a URL: re.search returned None,
            # so .group raised AttributeError. Catch only that.
            s_url = ''
        # Tag real series pages so callers can filter out one-off specials.
        s_series = 'series' if s_url.lower().endswith('series.jhtml') else ''
        found_shows.add((s_name, s_url, s_series))
    return tuple(sorted(found_shows))
def get_all_series():
    """Populate the module-level all_series dict by crawling the paged
    MTV listing until a page's content repeats (or a sanity cap of 20
    pages is reached). Maps show name -> show URL for 'series' entries.
    """
    global all_series
    all_series = dict()
    source = 'http://www.mtv.com/ontv/all/?page=%s'
    start = '<ol class="lst ">'
    end = '</ol>'
    sanity_max = 20
    show_pages = set()
    page_number = 0
    while page_number < sanity_max:
        page_number += 1
        response = requests.get(source % page_number)
        # Keep only the HTML between the listing's <ol> markers.
        listing = response.content.split(start, 1)[-1].split(end, 1)[0]
        page_shows = process_show_list(listing)
        if page_shows in show_pages:
            # Past the last page the site repeats itself; stop crawling.
            break
        show_pages.add(page_shows)
    # Flatten the per-page tuples into one deduplicated, sorted listing.
    every_show = sorted({show for page in show_pages for show in page})
    for name, url, kind in every_show:
        if kind == "series":
            all_series[name] = url
def friendly_name(e_name, e_s, e_id):
    """Build a display title for an episode.

    Prefers 'ID - Name' when an episode id is known, falls back to
    'Season N - Name' when only a season number is known, and otherwise
    returns the bare episode name.
    """
    if e_id:
        label = '%s - %s' % (e_id, e_name)
    elif e_s:
        label = 'Season %s - %s' % (e_s, e_name)
    else:
        label = e_name
    return label
def get_full_episodes(series_name):
    """Scrape the video page of a series for its full episodes.

    Args:
        series_name: key into the module-level all_series dict
            (populated beforehand by get_all_series()).

    Returns:
        A list of (video_uri, friendly_title, description) tuples, one
        per listing entry whose type marker reads "full episode".
    """
    global all_series
    base = 'http://www.mtv.com%s'
    # Markers bounding the episode listing inside the page HTML.
    start = '<div id="videoModuleListing" class="mdl last">'
    end = '<div class="mdl">'
    divider = '<li id="vidlist_'
    # Regexes for the per-episode metadata fields.
    show_id = r'mainuri="([^"]+)"'
    ep_type = r'<li class="list-ct">([^<]+)</li>'
    ep_name = r'itemprop="name" content="([^"]+)"'
    ep_numb = r'maintitle="([^"]+)"'
    ep_alt = r'<li class="list-ep">([^<]+)</li>'
    ep_desc = r'<meta[^>]+?itemprop="description"[^>]+?content="([^"]+)"'
    ep_sear = r'\(Se[^) ]+s[^) ]+ ([0-9]+)\)'  # e.g. "(Season 3)"
    ep_ep = r'Ep[^ 0-9]+ ([0-9]+)'             # e.g. "Episode 12"
    # The series page URL doubles as the video listing URL after a rename.
    episode_url = (base % all_series[series_name]).replace('series.jhtml', 'video.jhtml')
    episode_list = requests.get(episode_url).content.split(start,1)[-1].split(end,1)[0]
    episode_list = episode_list.replace('\n',' ').split(divider)[1:]
    full_eps = []
    # To decode HTML entities (&amp; etc.) in scraped names/descriptions.
    pars = HTMLParser.HTMLParser()
    for an_ep in episode_list:
        try:
            e_type = re.search(ep_type, an_ep).group(1)
        except AttributeError:
            # No type marker at all: treat as not a full episode.
            e_type = ''
        if e_type.lower().strip() != "full episode":
            continue
        e_id = re.search(show_id, an_ep).group(1)
        e_desc = pars.unescape(re.search(ep_desc, an_ep).group(1))
        e_name = pars.unescape(re.search(ep_name, an_ep).group(1))
        e_numb = pars.unescape(re.search(ep_numb, an_ep).group(1))
        try:
            # Season number lives left of the first '|' in the main title.
            ep_s = re.search(ep_sear, e_numb.split('|',1)[0]).group(1)
        except AttributeError:
            ep_s = ''
        try:
            # Episode number lives in the segment after the first '|'.
            ep_e = re.search(ep_ep, e_numb.split('|',1)[-1].split('|',1)[0]).group(1)
        except AttributeError:
            ep_e = ''
        if not ep_e:
            # Only trust the episode information if it wasn't in the show title
            try:
                ep_e = ''.join([x for x in re.search(ep_alt, an_ep).group(1) if x in '0123456789'])
            except AttributeError:
                ep_e = ''
        if ep_e:
            # There's at least specific episode information; guess a show
            # number: the last two digits are the episode, any leading
            # digits the season (e.g. '304' -> '3x04').
            f_ep = ('%02d' % int(ep_e))[-2:]
            f_se = ('%02d' % int(ep_e))[:-2]
            if f_se:
                f_id = f_se + 'x' + f_ep
            elif ep_s:
                f_id = ep_s + 'x' + f_ep
            else:
                f_id = f_ep
        else:
            f_id = ''
        a_name = friendly_name(e_name, ep_s, f_id)
        full_eps.append((e_id, a_name, e_desc))
    return full_eps
def get_resolutions(video_uri):
    """List the (bitrate, 'WIDTHxHEIGHT') renditions available for a video.

    Follows the embed RSS feed to the video's first clip, then asks the
    mediaGen endpoint for that clip's rendition list.
    """
    # www.mtv.com/player/embed/AS3/rss/?uri=mgid:uma:videolist:mtv.com:1611381&ref={ref} HTTP/1.1
    # www.mtv.com/player/includes/mediaGen.jhtml?uri=mgid:uma:video:mtv.com:381464&id=1611381&vid=381464&ref={ref}
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    guid_node = parseString(feed.content).getElementsByTagName('guid')[0]
    clip_uri = guid_node.childNodes[0].wholeText
    clip_vid = clip_uri.rsplit(':', 1)[-1]
    media_xml = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
    rendition_nodes = parseString(media_xml.content).getElementsByTagName('rendition')
    return tuple((node.getAttribute('bitrate'),
                  node.getAttribute('width') + 'x' + node.getAttribute('height'))
                 for node in rendition_nodes)
def get_episode_urls(video_uri, bitrate):
    """Resolve every clip of a video into a direct download URL at the
    requested bitrate, in feed order (one URL per clip)."""
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    feed_dom = parseString(feed.content)
    results = []
    for guid_node in feed_dom.getElementsByTagName('guid'):
        clip_uri = guid_node.childNodes[0].wholeText
        clip_vid = clip_uri.rsplit(':', 1)[-1]
        media_xml = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
        media_dom = parseString(media_xml.content)
        # First rendition whose bitrate matches the one requested.
        matching = [node for node in media_dom.getElementsByTagName('rendition')
                    if node.getAttribute('bitrate') == bitrate]
        rendition = matching[0]
        src_text = rendition.getElementsByTagName('src')[0].childNodes[0].wholeText
        # Keep everything after '/gsp.' and rebuild against the CDN host.
        media_end = src_text.split('/gsp.', 1)[-1]
        results.append('http://mtvnmobile2.rd.llnwd.net/44620/mtvnorigin/gsp.' + media_end)
    return results
def gen_downloads(episode_tuple, bitrate):
    """Download every part of an episode via a local axel binary.

    episode_tuple is (video_uri, title, description) as produced by
    get_full_episodes(); each part's redirect target is resolved first,
    then handed to axel for multi-connection download.
    """
    video_uri, title, desc = episode_tuple
    part_urls = get_episode_urls(video_uri, bitrate)
    for index, part_url in enumerate(part_urls):
        # The CDN URL answers with a redirect; capture the real location
        # without following it.
        probe = requests.get(part_url, allow_redirects=False)
        real_url = probe.headers['location']
        # Replace filesystem/shell-hostile characters before quoting.
        safe_title = ''.join([x if (x in '-_.()\'| abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') else "_" for x in title])
        filename = pipes.quote('%s - %s.mp4' % (safe_title, index + 1))
        # /Users/mike/bin/axel -n 6 -u -o 'filename.mp4' 'url'
        _ = os.system('/Users/mike/bin/axel -n 6 -u -o %s %s' % (filename, pipes.quote(real_url)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment