Skip to content

Instantly share code, notes, and snippets.

@JeanOlivier
Created March 19, 2022 05:13
Show Gist options
  • Save JeanOlivier/346cfcfdbc92c94a462fcb4017b481ac to your computer and use it in GitHub Desktop.
Save JeanOlivier/346cfcfdbc92c94a462fcb4017b481ac to your computer and use it in GitHub Desktop.
March Meeting 2022 Live talks downloader. Procedure + code to download the live talks once they're available on demand. Might be useful for many similar livestreams with non-predictable chunk filenames.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json
import subprocess
from base64 import b64decode
from haralyzer import HarParser, HarPage
from dateutil import parser as dateparser
"""
MM2022 video downloader
0. In Firefox: set "devtools.netmonitor.responseBodyLimit" to 0 in "about:config"
1. Load talk page in firefox, don't start the stream
2. Open the dev tools in firefox with ctrl+shift+c, browse to the network tab, set "Persist Logs" to True in the network tab settings.
3. Start the stream and quickly set the resolution to the desired one (e.g. 720p)
4. Play the whole talk (you can do that on 2x and on mute)
5. Once it's done:
1. Search the network tab for "m4a"
2. Right-click the first file ---> Save All As HAR
3. Save as "*session*_audio.har"
6. Then:
1. Search the network tab for "m4v"
2. Right-click the first file ---> Save All As HAR
3. Save as "*session*_video.har"
7. Execute `extract_session(*session*)`
"""
def parse_har_file(har_file, output_file):
    """Reassemble a media stream from a browser HAR capture.

    Reads the HAR dump at *har_file*, orders every recorded response by the
    time its download started, strips the site's custom per-chunk header and
    concatenates the raw MP4 fragments into *output_file*.

    Parameters
    ----------
    har_file : str
        Path to the HAR file saved from the browser's network tab.
    output_file : str
        Path of the binary file the concatenated chunks are written to.
    """
    # Parsing HAR file dump
    with open(har_file, 'r') as f:
        har_parser = HarParser(json.loads(f.read()))
    # The actual data
    data = har_parser.har_data
    # Entries may appear out of order in the HAR; sort by download start time
    # so the chunks are concatenated in playback order.
    sorted_entries = sorted(data['entries'],
                            key=lambda e: dateparser.parse(e['startedDateTime']))
    # Writing each chunk to file sequentially
    with open(output_file, 'wb') as f:
        for e in sorted_entries:
            tmp = b64decode(e['response']['content']['text'])
            # The site prepends a custom header before the MP4 'ftyp' box
            # (the 4-byte box length sits right before the tag) — strip it.
            ftyp_pos = tmp.find(b'ftyp')
            if ftyp_pos >= 4:
                f.write(tmp[ftyp_pos - 4:])
            else:
                # No 'ftyp' marker in this chunk (e.g. a continuation
                # fragment). The old `find() - 4` arithmetic would yield -5
                # and keep only the last 5 bytes; write the chunk untouched
                # instead.
                f.write(tmp)
def merge_audio_video(session_name, cleanup=True):
    """Mux the extracted audio and video tracks into a single MKV.

    Both streams are copied, not re-encoded: faster and lossless. Note the
    resolution might change within the video; VLC handles that well, other
    players are untested.

    Parameters
    ----------
    session_name : str
        Base name of the session; ``{session_name}_video.m4v`` and
        ``{session_name}_audio.m4a`` must exist.
    cleanup : bool, optional
        When True (default), remove the intermediate audio/video files
        after the merge attempt (they are easy to recreate from the HARs).
    """
    # Build the command as an argument list (shell=False semantics): the old
    # `cmd.split(" ")` mangled session names containing spaces.
    args = [
        "ffmpeg",
        "-i", f"{session_name}_video.m4v",
        "-i", f"{session_name}_audio.m4a",
        "-c:v", "copy",
        "-c:a", "copy",
        f"{session_name}_Final.mkv",
    ]
    # Merging using FFMPEG
    try:
        subprocess.check_output(args)
    # In case of error, let's tell the user to do it themselves
    except subprocess.CalledProcessError:
        cmd = " ".join(args)
        print(f"\nAutomatic merging of audio and video failed.\nYou should now merge the audio and video manually, e.g. using FFMPEG:\n {cmd}")
    if cleanup:  # Removing intermediate files we can easily recreate
        for t in ["audio", "video"]:
            os.remove(f"{session_name}_{t}.m4{t[0]}")
def extract_session(session_name, merge_av=True, *args, **kwargs):
    """Turn a pair of saved HAR captures into playable media files.

    Expects ``{session_name}_audio.har`` and ``{session_name}_video.har``
    on disk (the naming convention is ``{session_name}_{type}.har``),
    extracts each into an ``.m4a`` / ``.m4v`` track, and optionally muxes
    them; extra positional/keyword arguments are forwarded to
    ``merge_audio_video``.
    """
    # Audio first, then video — each HAR becomes its matching media file.
    for track, ext in (("audio", "m4a"), ("video", "m4v")):
        parse_har_file(f"{session_name}_{track}.har",
                       f"{session_name}_{track}.{ext}")
    if merge_av:
        merge_audio_video(session_name, *args, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment