@dreness / owl.py
Last active Jan 12, 2018
Reveal OverwatchLeague video URLs
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Work in progress. For now, just show URLs to m3u8 files of match videos.
To use, first do:
pip install -r requirements.txt
TODO: pre-season VODs were split out by game; season1 VODs contain all games in a match
"""
from __future__ import unicode_literals
from os import getcwd
import re
import sys
import json
import attr
from textwrap import dedent
import argparse
from pprint import pprint as pp
from datetime import datetime, timedelta
import youtube_dl
import requests
import validators
import cachecontrol
from bs4 import BeautifulSoup
@attr.s
class OWGame(object):
"""
A single game of Overwatch. For us, this represents everything that happens
in a single VOD file on overwatchleague.com.
Args:
matchparser (OWMatchParser): reference to the match parser
parentmatch (OWMatch): reference to the match containing this game
gamedesc (str): display description
gameslug (str): short string that is unique relative to siblings
        date (str): date and time of posting, as scraped from the tile's data-date attribute
        m3u8 (str): url to this game's m3u8 file that yields the video
        pageurl (str): url to the web page that holds the m3u8
        duration (timedelta): duration of the game
"""
# pylint: disable=too-many-instance-attributes
# pylint: disable=too-many-arguments
matchparser = attr.ib()
parentmatch = attr.ib()
gamedesc = attr.ib()
gameslug = attr.ib(default=None)
date = attr.ib(default=None)
m3u8 = attr.ib(default=None)
pageurl = attr.ib(default=None)
duration = attr.ib(default=None)
def __str__(self):
return str(self.gamedesc)
def find_stream_url(self):
"""
The HLS stream URL for a game is found on the game's detail page.
Load the page, scrape the URL, store it as m3u8.
"""
vid_r = self.matchparser.sess.get(self.pageurl)
vid_soup = BeautifulSoup(vid_r.text, 'html.parser')
# The m3u8 url is specified in one of the javascripts
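        # Assumed shape of the assignment being scraped (inferred from the parsing
        # below, not from a documented API):
        #   someVar = {"streams": [{"streamUrl": "//host/path/video_4800k.m3u8"}]};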
scripts = vid_soup.find_all("script")
        for script in [x.text for x in scripts]:
            for line in script.split("\n"):
                # The original slice [::1] was a no-op, so just split on " = " and
                # inspect every chunk for the m3u8 payload
                vals = line.split(" = ")
                for val in [s.strip("\n").strip(";") for s in vals]:
                    if "m3u8" in val:
                        js = json.loads(val)
                        # streamUrl appears to be protocol-relative, hence the prepended scheme
                        self.m3u8 = "http:" + js['streams'][0]['streamUrl']
if not validators.url(self.m3u8):
print("Failed to identify HLS URL for {}!".format(
self.gameslug))
def download(self):
"""Download a video with youtube-dl"""
# output file path and filename
ytdl_out = "{}/{}/{}.%(ext)s".format(
self.matchparser.args.directory,
self.parentmatch.matchslug,
self.gameslug,
)
opts = dict(outtmpl=ytdl_out, format='bestaudio/best', logger=YTDLLogger(), progress_hooks=[ytdl_hook],
forceurl=True, forcefilename=True, restrictfilenames=True, nooverwrites=True,
merge_output_format="mp4")
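        # The dict keys above correspond to youtube-dl's embedding (YoutubeDL) options;
        # a rough command-line equivalent (a sketch, with <...> as placeholders) would be:
        #   youtube-dl -o '<outtmpl>' --restrict-filenames --no-overwrites '<m3u8-url>'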
with youtube_dl.YoutubeDL(opts) as ydl:
ydl.download([self.m3u8])
@attr.s
class OWMatch(object):
"""
An OWMatch is a group of OWGames in a 'best of' series
:param matchurl: url to the overview page for this match
:type matchurl: str
:param games: series of games that comprise this match
:type games: list
"""
# pylint: disable=too-many-arguments
matchparser = attr.ib()
matchslug = attr.ib(default=None)
matchurl = attr.ib(default=None)
matchdesc = attr.ib(default=None)
games = attr.ib(default=attr.Factory(list))
matchdiv = attr.ib(default=None)
@property
def duration(self):
"""Sum up the durations of all games in this match"""
delta = timedelta(0)
for game in self.games:
delta = game.duration + delta
return delta
def __str__(self):
return str(self.matchslug)
def __len__(self):
return len(self.games)
def get_game_details(self):
"""Get the game details that can be gleaned from the overview page"""
for mdiv in self.matchdiv:
gamedesc = mdiv.find('a').get("data-title").strip()
game = OWGame(
matchparser=self.matchparser,
parentmatch=self,
gamedesc=gamedesc)
game.pageurl = mdiv.find('a').get("data-mlg-embed")
game.date = mdiv.find('a').get("data-date")
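            # The tile shows the duration as "MM:SS"; parsing with "%M:%S" assumes
            # no single game VOD runs an hour or longer.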
parsed_dt = datetime.strptime(mdiv.span.text, "%M:%S")
delta = timedelta(
hours=parsed_dt.hour, minutes=parsed_dt.minute, seconds=parsed_dt.second)
game.duration = delta
reg = re.match(r".*?Game (?P<game_number>\d)", gamedesc)
if reg:
game_number = reg.groups()[0]
game.gameslug = "{}-game-{}".format(self.matchslug, game_number)
else:
print("Failed to parse game number for {}!".format(gamedesc))
self.games.append(game)
@attr.s
class OWMatchparser(object):
"""
Scrape the overwatchleague video catalog page to identify each OWMatch
:param args: cli args dictionary
:type args: dict
:param matches: matches parsed by this parser
:type matches: dict
:param sess: CacheControl http client session
    :type sess: cachecontrol.CacheControl session
:returns: {matchslug: OWMatch, ...}
:rtype: dict
    Each match is named in an <h3> tag on the catalog page, which we'll use as
    OWMatch.matchslug. The catalog also carries a bag of other non-match videos
    under an <h3> whose text is "VIDEOS"; filter that out.
"""
args = attr.ib()
matches = attr.ib(default=attr.Factory(dict))
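    # attr.ib(default=...) with a concrete object is evaluated once at class definition
    # time, so every OWMatchparser instance shares this single cached HTTP session.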
sess = attr.ib(default=cachecontrol.CacheControl(requests.Session()))
def __iter__(self):
return iter(self.matches)
def __contains__(self, value):
return value in self.matches
def __len__(self):
return len(self.matches)
    def __getitem__(self, key):
        return self.matches[key]
def get_match_overviews(self):
"""Load the top-level archive page, detect matches"""
req = self.sess.get(self.args.url)
if req.status_code != 200:
print("Failed to access args.url: {}".format(self.args.url))
sys.exit(1)
if not req.text:
print("Got a response for {}, it's empty!?".format(self.args.url))
soup = BeautifulSoup(req.text, 'html.parser')
# All the stuff we want is under soup.section.div
div = soup.section.div
# Select tags whose class includes "UnderlineAnimation" and that have
# an attribute called "data-mlg-embed" (the value is a url to a video page)
# tiles are the rectangular elements that represent OWGames.
# tile_regex = re.compile(r"UnderlineAnimation")
# attr_filter = {"data-mlg-embed": True, "class": tile_regex}
# tiles = div.find_all(attrs=attr_filter, recursive=True)
def match_selector(tag):
"""Implement a predicate only slightly too complicated to be in-line"""
return tag.name == "h3" and tag.text != "VIDEOS"
# For each match, obtain the corresponding div of OWGames
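        # Assumed markup, inferred from the selectors used below: each match appears as
        #   <h3>match title</h3>
        #   <div data-analytics-placement="matchslug"> ...one tile per game... </div>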
for h3_div in div.find_all(match_selector, recursive=False):
matchslug = h3_div.nextSibling['data-analytics-placement']
if matchslug is None:
print("Couldn't find matchslug for match {}!".format(h3_div))
sys.exit(1)
matchdiv = h3_div.nextSibling
if matchdiv is None:
print("Couldn't get div for match {}".format(h3_div))
sys.exit(1)
matchdesc = h3_div.text
self.matches[matchslug] = OWMatch(
matchparser=self,
matchslug=matchslug,
matchdiv=matchdiv,
matchdesc=matchdesc)
return self.matches
    def games(self, slug=None):
        """Return the game for a gameslug, all games in a match for a matchslug, or all games"""
        # Previously declared as a @property, which cannot accept the slug argument;
        # as a plain method it can filter by match slug or game slug.
        games = []
        for match in self.matches.values():
            if slug:
                if match.matchslug == slug:
                    games.extend(match.games)
                for game in match.games:
                    if game.gameslug == slug:
                        games.append(game)
            else:
                games.extend(match.games)
        return games
class YTDLLogger(object):
"""Customize YTDL output?"""
# pylint: disable=missing-docstring
def debug(self, msg):
pass
def warning(self, msg):
pass
@staticmethod
def error(msg):
print(msg)
def ytdl_hook(download):
"""YTDL status callbacks"""
if download['status'] == 'finished':
print('Done downloading, now converting ...')
    if download['status'] == 'downloading':
        # The progress hook receives a dict, so use key lookups rather than attribute access
        msg = "Downloading {} for {} seconds with {} of {} bytes. ETA: {}".format(
            download.get('filename'), download.get('elapsed'),
            download.get('downloaded_bytes'), download.get('total_bytes'),
            download.get('eta'))
        # Rewrite the same terminal line: carriage return, message, clear to end of line
        sys.stdout.write("\r" + msg)
        sys.stdout.write("\033[K")
        sys.stdout.flush()
def main():
"""Start here for interactive use"""
parser = argparse.ArgumentParser(
description='Download overwatchleague videos', )
parser.add_argument(
"-d",
"--directory",
type=str,
help="Path to download directory (cwd by default)",
default=getcwd(),
)
parser.add_argument(
"-y",
"--youtube-dl",
type=str,
default="youtube-dl",
help="Path to youtube-dl (find in $PATH by default)",
)
parser.add_argument(
"-u",
"--url",
type=str,
default="https://overwatchleague.com/en-us/videos",
help=dedent('\n'
' URL to overwatchleague video listing page.\n'
' Default: https://overwatchleague.com/en-us/videos\n'
' '),
)
parser.add_argument(
"--dry-run",
action='store_true',
help="Don't download videos",
)
parser.add_argument(
"-g",
"--get-item",
type=str,
help="Download a specific game by slug",
)
parser.add_argument(
"-l",
"--list-matches",
action='store_true',
default=True,
help="List match metadata",
)
parser.add_argument(
"-i",
"--interactive",
action='store_true',
default=False,
help="Enter IPython console before exit",
)
parser.add_argument(
"-v",
"--list-videos",
action='store_true',
default=True,
help="List video metadata",
)
args = parser.parse_args()
# This would be an example of what you'd do if importing this as a module
# I could go straight for a results dict with:
# matches = OWMatchparser(args).get_match_overviews()
# ... but instead store a ref to the match parser to use its methods later
matchparser = OWMatchparser(args)
matches = matchparser.get_match_overviews()
print("Found {} matches.".format(len(matches)))
for match in matches.values():
print("Getting game metadata for match: {}".format(match))
match.get_game_details()
for game in match.games:
print("Getting stream URL for game: {}".format(game))
game.find_stream_url()
print("\n")
for slug, match in matches.items():
print("{} - {}".format(slug, match.matchdesc))
for game in match.games:
pp(game.m3u8)
print("\n")
# import pdb ; pdb.set_trace()
    if args.get_item:
        slug = args.get_item.strip("'").strip('"')
        # games() accepts either a matchslug or a gameslug (see OWMatchparser.games)
        targets = matchparser.games(slug)
        for game in targets:
            game.download()
if args.interactive:
from ptpython.repl import embed
embed(globals(), locals())
if __name__ == "__main__":
main()
# sample m3u8 url:
# https://mlgmsod-pipeline.akamaized.net/media/production/delivery/73/13/7313aade-cef9-4fff-8021-39e7bed05bda/WOMhqfusMkc_9632a417-8a39-43bb-9d71-5892cdfc4c81_4800k.m3u8
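# Such an HLS URL can also be consumed directly outside this script (illustrative
# commands, not part of the workflow above):
#   youtube-dl "<m3u8-url>"
#   ffmpeg -i "<m3u8-url>" -c copy game.mp4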
requirements.txt

youtube_dl==2017.12.14
validators==0.12.0
requests==2.18.4
requests_cache==0.4.13
ptpython==0.41
attrs==17.3.0
beautifulsoup4==4.6.0
cachecontrol==0.12.3