Skip to content

Instantly share code, notes, and snippets.

@pudquick
Last active December 29, 2015 18:59
Show Gist options
  • Save pudquick/7714861 to your computer and use it in GitHub Desktop.
import re, requests, HTMLParser, os, pipes
from xml.dom.minidom import parseString
# Module-level cache mapping show name -> show URL; populated by
# get_all_series().  (The original `global all_series` statement was a
# no-op at module level -- an explicit initializer makes the variable
# exist even before get_all_series() has run.)
all_series = {}
def process_show_list(show_list):
    """Parse one MTV "all shows" listing fragment into show tuples.

    show_list is the raw HTML between the <ol> listing markers; entries
    are delimited by '<li class="group">'.  Returns a sorted tuple of
    unique (name, url, series_flag) tuples, where series_flag is
    'series' when the URL ends in 'series.jhtml' and '' otherwise.
    An empty or boundary-free input yields ().
    """
    split_boundary = '<li class="group">'
    show_name = r'meta content="([^"]+)" itemprop="name"'
    show_url = r'<a itemprop="" href="([^"]+)">'
    found_shows = set()
    # Collapse newlines so the per-entry regexes see one long line.
    meta_data = show_list.replace('\n', ' ').split(split_boundary)[1:]
    for an_ep in meta_data:
        s_name = re.search(show_name, an_ep).group(1)
        try:
            s_url = re.search(show_url, an_ep).group(1)
        except AttributeError:
            # Apparently lack of a show URL is allowed; re.search
            # returned None, so .group raised AttributeError.
            s_url = ''
        s_series = 'series' if s_url.lower().endswith('series.jhtml') else ''
        found_shows.add((s_name, s_url, s_series))
    return tuple(sorted(found_shows))
def get_all_series(max_pages=20):
    """Populate the module-level all_series dict (show name -> URL).

    Pages through http://www.mtv.com/ontv/all/?page=N until a page's
    show set repeats (the site serves the last page again when N runs
    past the end).  max_pages caps the number of pages fetched as a
    sanity limit; the default 20 matches the original hard-coded value.
    Only entries whose URL marks them as a series are kept.
    """
    global all_series
    all_series = dict()
    source = 'http://www.mtv.com/ontv/all/?page=%s'
    start = '<ol class="lst ">'
    end = '</ol>'
    count = 0
    show_pages = set()
    keep_going = True
    while keep_going and (count < max_pages):
        count += 1
        series_page = requests.get(source % count)
        # Isolate the <ol> listing block, then parse it into show tuples.
        show_list = series_page.content.split(start, 1)[-1].split(end, 1)[0]
        found_shows = process_show_list(show_list)
        if found_shows in show_pages:
            # We're repeating ourselves, time to exit.
            keep_going = False
        else:
            show_pages.add(found_shows)
    # Flatten the per-page tuples and keep only true series entries.
    total_shows = sorted({item for sublist in show_pages for item in sublist})
    for series in total_shows:
        if series[2] == 'series':
            all_series[series[0]] = series[1]
def friendly_name(e_name, e_s, e_id):
    """Build a display title for an episode.

    Prefers a full episode id (e.g. '3x05'), then a bare season number,
    and falls back to the plain episode name when neither is known.
    """
    if e_id:
        label = '%s - %s' % (e_id, e_name)
    elif e_s:
        label = 'Season %s - %s' % (e_s, e_name)
    else:
        label = e_name
    return label
def get_full_episodes(series_name):
    """Return [(video_uri, friendly_title, description), ...] for every
    "Full Episode" on the show's video page.

    series_name must be a key in the module-level all_series dict;
    populate it with get_all_series() first (KeyError otherwise).
    """
    global all_series
    base = 'http://www.mtv.com%s'
    start = '<div id="videoModuleListing" class="mdl last">'
    end = '<div class="mdl">'
    divider = '<li id="vidlist_'
    show_id = r'mainuri="([^"]+)"'
    ep_type = r'<li class="list-ct">([^<]+)</li>'
    ep_name = r'itemprop="name" content="([^"]+)"'
    ep_numb = r'maintitle="([^"]+)"'
    ep_alt = r'<li class="list-ep">([^<]+)</li>'
    ep_desc = r'<meta[^>]+?itemprop="description"[^>]+?content="([^"]+)"'
    ep_sear = r'\(Se[^) ]+s[^) ]+ ([0-9]+)\)'
    ep_ep = r'Ep[^ 0-9]+ ([0-9]+)'
    # The series page URL doubles as the video listing URL after a rename.
    episode_url = (base % all_series[series_name]).replace('series.jhtml', 'video.jhtml')
    episode_list = requests.get(episode_url).content.split(start, 1)[-1].split(end, 1)[0]
    episode_list = episode_list.replace('\n', ' ').split(divider)[1:]
    full_eps = []
    # To decode HTML entities
    pars = HTMLParser.HTMLParser()
    for an_ep in episode_list:
        try:
            e_type = re.search(ep_type, an_ep).group(1)
        except AttributeError:
            # No type marker at all -- treat as not a full episode.
            e_type = ''
        if e_type.lower().strip() != "full episode":
            continue
        e_id = re.search(show_id, an_ep).group(1)
        e_desc = pars.unescape(re.search(ep_desc, an_ep).group(1))
        e_name = pars.unescape(re.search(ep_name, an_ep).group(1))
        e_numb = pars.unescape(re.search(ep_numb, an_ep).group(1))
        # Season number, e.g. "(Season 3)", looked for before the first '|'.
        try:
            ep_s = re.search(ep_sear, e_numb.split('|', 1)[0]).group(1)
        except AttributeError:
            ep_s = ''
        # Episode number, e.g. "Episode 5", in the segment after the first '|'.
        try:
            ep_e = re.search(ep_ep, e_numb.split('|', 1)[-1].split('|', 1)[0]).group(1)
        except AttributeError:
            ep_e = ''
        if not ep_e:
            # Only trust the episode information if it wasn't in the show title
            try:
                ep_e = ''.join([x for x in re.search(ep_alt, an_ep).group(1) if x in '0123456789'])
            except AttributeError:
                ep_e = ''
        if ep_e:
            # There's at least a specific episode information, time to guess a
            # show number: e.g. 305 splits into season '3' + episode '05'.
            f_ep = ('%02d' % int(ep_e))[-2:]
            f_se = ('%02d' % int(ep_e))[:-2]
            if f_se:
                f_id = f_se + 'x' + f_ep
            elif ep_s:
                f_id = ep_s + 'x' + f_ep
            else:
                f_id = f_ep
        else:
            f_id = ''
        a_name = friendly_name(e_name, ep_s, f_id)
        full_eps.append((e_id, a_name, e_desc))
    return full_eps
def get_resolutions(video_uri):
    """List the (bitrate, 'WIDTHxHEIGHT') renditions available for an
    MTV video URI, as a tuple of string pairs."""
    # Feed URL patterns used below:
    # www.mtv.com/player/embed/AS3/rss/?uri=mgid:uma:videolist:mtv.com:1611381&ref={ref} HTTP/1.1
    # www.mtv.com/player/includes/mediaGen.jhtml?uri=mgid:uma:video:mtv.com:381464&id=1611381&vid=381464&ref={ref}
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    # Only the first clip's renditions are inspected.
    guid_node = parseString(feed.content).getElementsByTagName('guid')[0]
    clip_uri = guid_node.childNodes[0].wholeText
    clip_vid = clip_uri.rsplit(':', 1)[-1]
    media = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
    renditions = parseString(media.content).getElementsByTagName('rendition')
    options = []
    for node in renditions:
        size = node.getAttribute('width') + 'x' + node.getAttribute('height')
        options.append((node.getAttribute('bitrate'), size))
    return tuple(options)
def get_episode_urls(video_uri, bitrate):
    """Return the direct CDN download URL for each clip segment of the
    episode, at the requested bitrate string (see get_resolutions)."""
    video_id = video_uri.rsplit(':', 1)[-1]
    feed = requests.get('http://www.mtv.com/player/embed/AS3/rss/?uri=%s&ref={ref}' % video_uri)
    results = []
    for section in parseString(feed.content).getElementsByTagName('guid'):
        clip_uri = section.childNodes[0].wholeText
        clip_vid = clip_uri.rsplit(':', 1)[-1]
        media = requests.get('http://www.mtv.com/player/includes/mediaGen.jhtml?uri=%s&id=%s&vid=%s&ref={ref}' % (clip_uri, video_id, clip_vid))
        renditions = parseString(media.content).getElementsByTagName('rendition')
        # Pick the rendition matching the requested bitrate
        # (IndexError if absent, same as the original behavior).
        rendition = [x for x in renditions if x.getAttribute('bitrate') == bitrate][0]
        src_text = rendition.getElementsByTagName('src')[0].childNodes[0].wholeText
        media_end = src_text.split('/gsp.', 1)[-1]
        results.append('http://mtvnmobile2.rd.llnwd.net/44620/mtvnorigin/gsp.' + media_end)
    return results
def gen_downloads(episode_tuple, bitrate, axel_path='/Users/mike/bin/axel'):
    """Download every part of an episode with the axel accelerator.

    episode_tuple is a (video_uri, title, description) tuple as produced
    by get_full_episodes(); bitrate selects the rendition (see
    get_resolutions()).  axel_path generalizes the previously hard-coded
    downloader location (default preserves the original command).
    """
    video_uri, title, desc = episode_tuple
    # Characters allowed verbatim in the output filename; anything else -> '_'.
    safe_chars = '-_.()\'| abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    for i, a_part in enumerate(get_episode_urls(video_uri, bitrate)):
        # Resolve the CDN redirect ourselves so axel gets the final URL.
        # NOTE(review): raises KeyError if the server does not redirect.
        check_url = requests.get(a_part, allow_redirects=False)
        real_url = check_url.headers['location']
        safe_title = ''.join([x if x in safe_chars else '_' for x in title])
        filename = pipes.quote('%s - %s.mp4' % (safe_title, i + 1))
        # NOTE(security): os.system runs through a shell; filename and URL are
        # shell-quoted via pipes.quote, but subprocess with an argument list
        # would be safer if this ever handles less-trusted input.
        _ = os.system('%s -n 6 -u -o %s %s' % (axel_path, filename, pipes.quote(real_url)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment