Skip to content

Instantly share code, notes, and snippets.

@JeanOlivier
Created March 19, 2022 05:13
Show Gist options
  • Save JeanOlivier/346cfcfdbc92c94a462fcb4017b481ac to your computer and use it in GitHub Desktop.
Save JeanOlivier/346cfcfdbc92c94a462fcb4017b481ac to your computer and use it in GitHub Desktop.
March Meeting 2022 Live talks downloader. Procedure + code to download the live talks once they're available on demand. Might be useful for many similar livestreams with non-predictable chunk filenames.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json
import subprocess
from base64 import b64decode
from haralyzer import HarParser, HarPage
from dateutil import parser as dateparser
"""
MM2022 video downloader
0. In Firefox: set "devtools.netmonitor.responseBodyLimit" to 0 in "about:config"
1. Load talk page in firefox, don't start the stream
2. Open the dev tools in firefox with ctrl+shift+c, browse to the network tab, set "Persist Logs" to True in the network tab settings.
3. Start the stream and quickly set the resolution to the desired one (e.g. 720p)
4. Play the whole talk (you can do that on 2x and on mute)
5. Once it's done:
1. Search the network tab for "m4a"
2. Right-click the first file ---> Save All As HAR
3. Save as "*session*_audio.har"
6. Then:
1. Search the network tab for "m4v"
2. Right-click the first file ---> Save All As HAR
3. Save as "*session*_video.har"
7. Execute `extract_session(*session*)`
"""
def parse_har_file(har_file, output_file):
    """Reassemble a media stream from a browser HAR capture.

    Reads the HAR dump at *har_file*, orders every recorded response by the
    time its download started, strips the site's custom per-chunk header and
    concatenates the raw MP4 fragments into *output_file*.

    Parameters
    ----------
    har_file : str
        Path to the HAR file saved from the browser's network tab.
    output_file : str
        Path of the binary file the concatenated chunks are written to.
    """
    # Parsing HAR file dump
    with open(har_file, 'r') as f:
        har_parser = HarParser(json.loads(f.read()))
    # The actual data
    data = har_parser.har_data
    # Entries may appear out of order in the HAR; sort by download start time
    # so the chunks are concatenated in playback order.
    sorted_entries = sorted(data['entries'],
                            key=lambda e: dateparser.parse(e['startedDateTime']))
    # Writing each chunk to file sequentially
    with open(output_file, 'wb') as f:
        for e in sorted_entries:
            tmp = b64decode(e['response']['content']['text'])
            # The site prepends a custom header before the MP4 'ftyp' box
            # (the 4-byte box length sits right before the tag) — strip it.
            ftyp_pos = tmp.find(b'ftyp')
            if ftyp_pos >= 4:
                f.write(tmp[ftyp_pos - 4:])
            else:
                # No 'ftyp' marker in this chunk (e.g. a continuation
                # fragment). The old `find() - 4` arithmetic would yield -5
                # and keep only the last 5 bytes; write the chunk untouched
                # instead.
                f.write(tmp)
def merge_audio_video(session_name, cleanup=True):
    """Mux the extracted audio and video tracks into a single MKV.

    Both streams are copied, not re-encoded: faster and lossless. Note the
    resolution might change within the video; VLC handles that well, other
    players are untested.

    Parameters
    ----------
    session_name : str
        Base name of the session; ``{session_name}_video.m4v`` and
        ``{session_name}_audio.m4a`` must exist.
    cleanup : bool, optional
        When True (default), remove the intermediate audio/video files
        after the merge attempt (they are easy to recreate from the HARs).
    """
    # Build the command as an argument list (shell=False semantics): the old
    # `cmd.split(" ")` mangled session names containing spaces.
    args = [
        "ffmpeg",
        "-i", f"{session_name}_video.m4v",
        "-i", f"{session_name}_audio.m4a",
        "-c:v", "copy",
        "-c:a", "copy",
        f"{session_name}_Final.mkv",
    ]
    # Merging using FFMPEG
    try:
        subprocess.check_output(args)
    # In case of error, let's tell the user to do it themselves
    except subprocess.CalledProcessError:
        cmd = " ".join(args)
        print(f"\nAutomatic merging of audio and video failed.\nYou should now merge the audio and video manually, e.g. using FFMPEG:\n {cmd}")
    if cleanup:  # Removing intermediate files we can easily recreate
        for t in ["audio", "video"]:
            os.remove(f"{session_name}_{t}.m4{t[0]}")
def extract_session(session_name, merge_av=True, *args, **kwargs):
    """Turn a pair of saved HAR captures into playable media files.

    Expects ``{session_name}_audio.har`` and ``{session_name}_video.har``
    on disk (the naming convention is ``{session_name}_{type}.har``),
    extracts each into an ``.m4a`` / ``.m4v`` track, and optionally muxes
    them; extra positional/keyword arguments are forwarded to
    ``merge_audio_video``.
    """
    # Audio first, then video — each HAR becomes its matching media file.
    for track, ext in (("audio", "m4a"), ("video", "m4v")):
        parse_har_file(f"{session_name}_{track}.har",
                       f"{session_name}_{track}.{ext}")
    if merge_av:
        merge_audio_video(session_name, *args, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment