u1735067/united-we-stream_dl.py

## united-we-stream_dl.py
#!python3

import sys, os, io, re, subprocess, threading, shutil, json


def youtubedl_fetch(url):
  """Download `url` with the bundled youtube-dl and return its JSON metadata.

  The child's stderr is echoed to our stdout line by line while it runs. Its
  stdout, from which the JSON document is parsed, is copied into memory by a
  helper thread so that both pipes are drained concurrently. A non-zero exit
  status is only reported; the output is still parsed.

  Args:
    url: URL passed to youtube-dl as its final argument.

  Returns:
    The value decoded from the first line of youtube-dl's stdout.

  Raises:
    SystemExit: With status 2 when that first line is not valid JSON.
    KeyboardInterrupt: Re-raised after terminate() was sent to the child.
  """
  # Like check_output, but keep stderr
  # https://github.com/python/cpython/blob/2.7/Lib/subprocess.py#L194
  cmdline = [
    './youtube-dl-2020.06.16.1',
    '--ffmpeg-location', 'ffmpeg-4.3-amd64-static/',
    '--prefer-free-formats',
    '--merge-output-format', 'mkv',
    '-f', '(bestvideo[ext=webm]/bestvideo)+(bestaudio[ext=webm]/bestaudio)/best',
    '--write-info-json',
    '--write-thumbnail',
    '--print-json',
    '--verbose',
    url
  ]
  stdout_buffer = io.BytesIO()
  print('> {}'.format(' '.join(cmdline)))
  # Spawn outside the try block: if Popen itself fails or is interrupted there
  # is no `process` yet for the KeyboardInterrupt handler to terminate.
  process = subprocess.Popen(
    cmdline,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    cwd=os.getcwd(),
    env=os.environ.copy(),
    close_fds=True,
  )
  # https://stackoverflow.com/questions/42148113/alternative-to-subprocess-popen-communicate-for-streaming
  stdout_thread = threading.Thread(target=shutil.copyfileobj, args=(process.stdout, stdout_buffer))
  try:
    stdout_thread.start()
    for line in iter(process.stderr.readline, b''):
      sys.stdout.write(line.decode('utf-8', errors='replace'))
      sys.stdout.flush()
    retcode = process.wait()
    stdout_thread.join()
  except KeyboardInterrupt:
    process.terminate()
    raise
  # Both pipes have reached EOF and the copier thread is done: release them.
  process.stdout.close()
  process.stderr.close()
  if retcode:  # Problem
    print('>> Error executing subprocess {}, rc={}'.format(cmdline, retcode))
  # Decode leniently, like stderr above, so that a stray byte ends up in the
  # JSON error path below instead of an unhandled UnicodeDecodeError.
  stdout = stdout_buffer.getvalue().decode('utf-8', errors='replace')
  print(stdout)
  try:
    return json.loads(stdout.split('\n')[0])
  except ValueError as e:  # json.JSONDecodeError is a ValueError
    print('>> Error parsing JSON response: {}'.format(e))
    raise SystemExit(2)


def execute_command(cmd, args=None):
  """Run `cmd` with `args` and stream its output to our stdout.

  stderr is merged into stdout; every line is decoded as UTF-8 (invalid bytes
  replaced) and written as soon as it arrives. A non-zero exit status is
  reported with a message; it does not raise.

  Args:
    cmd: Path or name of the executable.
    args: Optional sequence of string arguments appended after `cmd`.

  Returns:
    The child's exit status.

  Raises:
    KeyboardInterrupt: Re-raised after terminate() was sent to the child.
  """
  # Like check_output, but keep stderr
  # https://github.com/python/cpython/blob/2.7/Lib/subprocess.py#L194
  # `None` replaces the mutable `[]` default; list() also accepts tuples.
  cmdline = [cmd] + list(args or [])
  print('> {}'.format(' '.join(cmdline)))
  # Spawn outside the try block: if Popen itself fails or is interrupted there
  # is no `process` yet for the KeyboardInterrupt handler to terminate.
  process = subprocess.Popen(
    cmdline,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    cwd=os.getcwd(),
    env=os.environ.copy(),
    close_fds=True,
  )
  try:
    for line in iter(process.stdout.readline, b''):
      sys.stdout.write(line.decode('utf-8', errors='replace'))
      sys.stdout.flush()
    retcode = process.wait()
  except KeyboardInterrupt:
    process.terminate()
    raise
  finally:
    # Only this thread reads the pipe, so it can always be released here.
    process.stdout.close()
  if retcode:  # Problem
    print('>> Error executing subprocess {}, rc={}'.format(cmdline, retcode))
  return retcode


# Keyed by collection id, i.e. the first '-'-separated field of a video's
# display_id. 'name' is used for the title and COLLECTION tag, 'location'
# (may be empty) for the RECORDING_LOCATION tag.
# '096844' used to be listed twice with the same value; a repeated key in a
# dict literal silently overrides the earlier one, so it is kept only once.
collection_map = {
  '096844': {'name': 'United We Stream', 'location': 'DE'},  # Guests starting at 50
  '097457': {'name': 'United We Stream', 'location': 'DE'},
  '098237': {'name': 'United We Stream', 'location': 'DE'},
  '096905': {'name': 'United We Stream Global', 'location': ''},
  '098344': {'name': 'United We Stream Festival', 'location': ''},
  '100605': {'name': 'United We Stream November Lockdown Edition', 'location': ''},
  '098001': {'name': 'United We Stream Paris', 'location': 'FR'},
}

# Script body.
# The first command-line argument names a text file with one URL per line;
# blank lines and lines starting with '#' are skipped.
if len(sys.argv) < 2:
  print('Usage: {} URL_LIST_FILE'.format(sys.argv[0]))
  raise SystemExit(1)

urls = []
with open(sys.argv[1], 'r') as in_list:
  for url in in_list:
    url = url.strip()
    if not url or url.startswith('#'):
      continue
    urls.append(url)

print('{} URLs to fetch'.format(len(urls)))

for i, url in enumerate(urls, start=1):
  print('--- ({}/{}) {}'.format(i, len(urls), url))
  meta_json = youtubedl_fetch(url)
  # display_id must split into exactly three '-'-separated fields; the first
  # selects the collection, the second is the fallback track number.
  collection_id, position, _ = meta_json['display_id'].split('-')
  collection_name = collection_map[collection_id]['name']
  collection_location = collection_map[collection_id]['location']
  # Collapse runs of whitespace in the title.
  full_title = re.sub(r'\s{2,}', ' ', meta_json['fulltitle']).strip()

  # Try to extract from name first, else use position
  track_from_name = re.search(r'#\s*(?P<track>[0-9]+)', full_title)
  track_id = track_from_name.group('track') if track_from_name is not None else position
  track_id = int(track_id)

  # Drop the leading 'United We Stream ... <separator> ' part of the title.
  session_name = re.sub(r'United We Stream.+?(:|-|à|en|@)\s+', '', full_title)

  # Collection 096844 entries from track 50 on get no '#<n>' in the title.
  if collection_id == '096844' and track_id >= 50:
    title = '{} - {}'.format(collection_name, session_name)
  else:
    title = '{} #{} - {}'.format(collection_name, track_id, session_name)

  print('Input {}: {}'.format('display_id', meta_json['display_id']))
  print('Input {}: {}'.format('fulltitle', meta_json['fulltitle']))
  print('Output {}: {}'.format('collection_name', collection_name))
  print('Output {}: {}'.format('track_id', track_id))
  print('Output {}: {}'.format('session_name', session_name))
  print('Output {}: {}'.format('title', title))

  # YYYYMMDD -> YYYY-MM-DD
  date = re.sub(r'(.{4})(.{2})(.{2})', r'\1-\2-\3', meta_json['upload_date'])

  in_filename = meta_json['_filename']
  # Sidecar files are looked up next to the video by swapping '.mp4'.
  thumbnail_filename = in_filename.replace('.mp4', '.jpg')
  info_filename = in_filename.replace('.mp4', '.info.json')
  out_filename = '{} [{}].mka'.format(title, meta_json['display_id'])
  description = (meta_json.get('description') or '').strip()

  ffmpeg_args = [
    '-loglevel', 'verbose',
    '-i', in_filename,
    '-c', 'copy',
    '-movflags', 'use_metadata_tags',
    '-map_metadata', '0',
    '-map_metadata:s:v', '0:s:v',
    '-map_metadata:s:a', '0:s:a',
    '-metadata', 'title={}'.format(title),
    '-metadata', 'track={}'.format(track_id),
    '-metadata', 'COLLECTION={}'.format(collection_name),
    '-metadata', 'DATE_RECORDED={}'.format(date),
  ]
  if collection_location:
    ffmpeg_args += ['-metadata', 'RECORDING_LOCATION={}'.format(collection_location)]
  ffmpeg_args += [
    '-metadata', 'DISTRIBUTED_BY={}'.format('Arte Concert'),
    '-metadata', 'CATALOG_NUMBER={}'.format(meta_json['display_id']),
    '-metadata', 'SESSION={}'.format(session_name),
  ]
  if description:
    ffmpeg_args += ['-metadata', 'DESCRIPTION={}'.format(description)]
  ffmpeg_args += [
    '-attach', thumbnail_filename,
    '-metadata:s:t:0', 'filename=cover_land.jpg',
    '-metadata:s:t:0', 'mimetype=image/jpeg',
    '-metadata:s:t:0', 'title=Thumbnail',
    '-attach', info_filename,
    '-metadata:s:t:1', 'filename=youtube-dl.info.json',
    '-metadata:s:t:1', 'mimetype=application/json',
    '-metadata:s:t:1', 'title="Youtube-dl info file"',
    out_filename
  ]
  ffmpeg_rc = execute_command(
    'ffmpeg-4.3-amd64-static/ffmpeg',
    ffmpeg_args
  )
  # Never delete the download unless the remux produced something: a truthy
  # return value is treated as a failing exit status (None means no status was
  # reported), and the output file must exist and be non-empty.
  if ffmpeg_rc or not os.path.isfile(out_filename) or not os.path.getsize(out_filename):
    print('>> ffmpeg did not produce "{}", keeping source files of {}'.format(out_filename, url))
    continue
  for source_file in (in_filename, thumbnail_filename, info_filename):
    os.remove(source_file)


'''
track_name =
./jq -r '.fulltitle | sub("\\s{2,}"; " ")'
./jq -r '.fulltitle'
./jq -r '.fulltitle | sub("United We Stream Festival\\s+((à|-|en|@)\\s+)?(?<session>.*)"; "\(.session)") | sub("\\s{2,}"; " ")')

session_name =
./jq -r '.fulltitle | sub(".*#\\s*[0-9]+\\s*(:|-)\\s*(?<session>.*)"; "\(.session)")
./jq -r '.fulltitle | sub("United We Stream\\s+((à|-|en)\\s+)?(?<session>.*)"; "\(.session)")
./jq -r '.fulltitle'
./jq -r '.fulltitle | sub("United We Stream\\s+((à|-)\\s+)?(?<session>.*)"; "\(.session)")
./jq -r '.fulltitle | sub("United We Stream Festival\\s+((à|-|en|@)\\s+)?(?<session>.*)"; "\(.session)")
./jq -r '.fulltitle | sub("United We Stream\\s+((à|-|en)\\s+)?(?<session>.*)"; "\(.session)")'
'''
	#!python3

	import sys, os, io, re, subprocess, threading, shutil, json


	def youtubedl_fetch(url):
	  """Download `url` with the bundled youtube-dl and return its JSON metadata.

	  The child's stderr is echoed to our stdout line by line while it runs. Its
	  stdout, from which the JSON document is parsed, is copied into memory by a
	  helper thread so that both pipes are drained concurrently. A non-zero exit
	  status is only reported; the output is still parsed.

	  Args:
	    url: URL passed to youtube-dl as its final argument.

	  Returns:
	    The value decoded from the first line of youtube-dl's stdout.

	  Raises:
	    SystemExit: With status 2 when that first line is not valid JSON.
	    KeyboardInterrupt: Re-raised after terminate() was sent to the child.
	  """
	  # Like check_output, but keep stderr
	  # https://github.com/python/cpython/blob/2.7/Lib/subprocess.py#L194
	  cmdline = [
	    './youtube-dl-2020.06.16.1',
	    '--ffmpeg-location', 'ffmpeg-4.3-amd64-static/',
	    '--prefer-free-formats',
	    '--merge-output-format', 'mkv',
	    '-f', '(bestvideo[ext=webm]/bestvideo)+(bestaudio[ext=webm]/bestaudio)/best',
	    '--write-info-json',
	    '--write-thumbnail',
	    '--print-json',
	    '--verbose',
	    url
	  ]
	  stdout_buffer = io.BytesIO()
	  print('> {}'.format(' '.join(cmdline)))
	  # Spawn outside the try block: if Popen itself fails or is interrupted there
	  # is no `process` yet for the KeyboardInterrupt handler to terminate.
	  process = subprocess.Popen(
	    cmdline,
	    stdout=subprocess.PIPE,
	    stderr=subprocess.PIPE,
	    cwd=os.getcwd(),
	    env=os.environ.copy(),
	    close_fds=True,
	  )
	  # https://stackoverflow.com/questions/42148113/alternative-to-subprocess-popen-communicate-for-streaming
	  stdout_thread = threading.Thread(target=shutil.copyfileobj, args=(process.stdout, stdout_buffer))
	  try:
	    stdout_thread.start()
	    for line in iter(process.stderr.readline, b''):
	      sys.stdout.write(line.decode('utf-8', errors='replace'))
	      sys.stdout.flush()
	    retcode = process.wait()
	    stdout_thread.join()
	  except KeyboardInterrupt:
	    process.terminate()
	    raise
	  # Both pipes have reached EOF and the copier thread is done: release them.
	  process.stdout.close()
	  process.stderr.close()
	  if retcode:  # Problem
	    print('>> Error executing subprocess {}, rc={}'.format(cmdline, retcode))
	  # Decode leniently, like stderr above, so that a stray byte ends up in the
	  # JSON error path below instead of an unhandled UnicodeDecodeError.
	  stdout = stdout_buffer.getvalue().decode('utf-8', errors='replace')
	  print(stdout)
	  try:
	    return json.loads(stdout.split('\n')[0])
	  except ValueError as e:  # json.JSONDecodeError is a ValueError
	    print('>> Error parsing JSON response: {}'.format(e))
	    raise SystemExit(2)


	def execute_command(cmd, args=None):
	  """Run `cmd` with `args` and stream its output to our stdout.

	  stderr is merged into stdout; every line is decoded as UTF-8 (invalid bytes
	  replaced) and written as soon as it arrives. A non-zero exit status is
	  reported with a message; it does not raise.

	  Args:
	    cmd: Path or name of the executable.
	    args: Optional sequence of string arguments appended after `cmd`.

	  Returns:
	    The child's exit status.

	  Raises:
	    KeyboardInterrupt: Re-raised after terminate() was sent to the child.
	  """
	  # Like check_output, but keep stderr
	  # https://github.com/python/cpython/blob/2.7/Lib/subprocess.py#L194
	  # `None` replaces the mutable `[]` default; list() also accepts tuples.
	  cmdline = [cmd] + list(args or [])
	  print('> {}'.format(' '.join(cmdline)))
	  # Spawn outside the try block: if Popen itself fails or is interrupted there
	  # is no `process` yet for the KeyboardInterrupt handler to terminate.
	  process = subprocess.Popen(
	    cmdline,
	    stdout=subprocess.PIPE,
	    stderr=subprocess.STDOUT,
	    cwd=os.getcwd(),
	    env=os.environ.copy(),
	    close_fds=True,
	  )
	  try:
	    for line in iter(process.stdout.readline, b''):
	      sys.stdout.write(line.decode('utf-8', errors='replace'))
	      sys.stdout.flush()
	    retcode = process.wait()
	  except KeyboardInterrupt:
	    process.terminate()
	    raise
	  finally:
	    # Only this thread reads the pipe, so it can always be released here.
	    process.stdout.close()
	  if retcode:  # Problem
	    print('>> Error executing subprocess {}, rc={}'.format(cmdline, retcode))
	  return retcode


	# Keyed by collection id, i.e. the first '-'-separated field of a video's
	# display_id. 'name' is used for the title and COLLECTION tag, 'location'
	# (may be empty) for the RECORDING_LOCATION tag.
	# '096844' used to be listed twice with the same value; a repeated key in a
	# dict literal silently overrides the earlier one, so it is kept only once.
	collection_map = {
	  '096844': {'name': 'United We Stream', 'location': 'DE'},  # Guests starting at 50
	  '097457': {'name': 'United We Stream', 'location': 'DE'},
	  '098237': {'name': 'United We Stream', 'location': 'DE'},
	  '096905': {'name': 'United We Stream Global', 'location': ''},
	  '098344': {'name': 'United We Stream Festival', 'location': ''},
	  '100605': {'name': 'United We Stream November Lockdown Edition', 'location': ''},
	  '098001': {'name': 'United We Stream Paris', 'location': 'FR'},
	}

	# Script body.
	# The first command-line argument names a text file with one URL per line;
	# blank lines and lines starting with '#' are skipped.
	if len(sys.argv) < 2:
	  print('Usage: {} URL_LIST_FILE'.format(sys.argv[0]))
	  raise SystemExit(1)

	urls = []
	with open(sys.argv[1], 'r') as in_list:
	  for url in in_list:
	    url = url.strip()
	    if not url or url.startswith('#'):
	      continue
	    urls.append(url)

	print('{} URLs to fetch'.format(len(urls)))

	for i, url in enumerate(urls, start=1):
	  print('--- ({}/{}) {}'.format(i, len(urls), url))
	  meta_json = youtubedl_fetch(url)
	  # display_id must split into exactly three '-'-separated fields; the first
	  # selects the collection, the second is the fallback track number.
	  collection_id, position, _ = meta_json['display_id'].split('-')
	  collection_name = collection_map[collection_id]['name']
	  collection_location = collection_map[collection_id]['location']
	  # Collapse runs of whitespace in the title.
	  full_title = re.sub(r'\s{2,}', ' ', meta_json['fulltitle']).strip()

	  # Try to extract from name first, else use position
	  track_from_name = re.search(r'#\s*(?P<track>[0-9]+)', full_title)
	  track_id = track_from_name.group('track') if track_from_name is not None else position
	  track_id = int(track_id)

	  # Drop the leading 'United We Stream ... <separator> ' part of the title.
	  # The alternation uses plain '|'; an escaped '\|' would only match a
	  # literal pipe character.
	  session_name = re.sub(r'United We Stream.+?(:|-|à|en|@)\s+', '', full_title)

	  # Collection 096844 entries from track 50 on get no '#<n>' in the title.
	  if collection_id == '096844' and track_id >= 50:
	    title = '{} - {}'.format(collection_name, session_name)
	  else:
	    title = '{} #{} - {}'.format(collection_name, track_id, session_name)

	  print('Input {}: {}'.format('display_id', meta_json['display_id']))
	  print('Input {}: {}'.format('fulltitle', meta_json['fulltitle']))
	  print('Output {}: {}'.format('collection_name', collection_name))
	  print('Output {}: {}'.format('track_id', track_id))
	  print('Output {}: {}'.format('session_name', session_name))
	  print('Output {}: {}'.format('title', title))

	  # YYYYMMDD -> YYYY-MM-DD
	  date = re.sub(r'(.{4})(.{2})(.{2})', r'\1-\2-\3', meta_json['upload_date'])

	  in_filename = meta_json['_filename']
	  # Sidecar files are looked up next to the video by swapping '.mp4'.
	  thumbnail_filename = in_filename.replace('.mp4', '.jpg')
	  info_filename = in_filename.replace('.mp4', '.info.json')
	  out_filename = '{} [{}].mka'.format(title, meta_json['display_id'])
	  description = (meta_json.get('description') or '').strip()

	  ffmpeg_args = [
	    '-loglevel', 'verbose',
	    '-i', in_filename,
	    '-c', 'copy',
	    '-movflags', 'use_metadata_tags',
	    '-map_metadata', '0',
	    '-map_metadata:s:v', '0:s:v',
	    '-map_metadata:s:a', '0:s:a',
	    '-metadata', 'title={}'.format(title),
	    '-metadata', 'track={}'.format(track_id),
	    '-metadata', 'COLLECTION={}'.format(collection_name),
	    '-metadata', 'DATE_RECORDED={}'.format(date),
	  ]
	  if collection_location:
	    ffmpeg_args += ['-metadata', 'RECORDING_LOCATION={}'.format(collection_location)]
	  ffmpeg_args += [
	    '-metadata', 'DISTRIBUTED_BY={}'.format('Arte Concert'),
	    '-metadata', 'CATALOG_NUMBER={}'.format(meta_json['display_id']),
	    '-metadata', 'SESSION={}'.format(session_name),
	  ]
	  if description:
	    ffmpeg_args += ['-metadata', 'DESCRIPTION={}'.format(description)]
	  ffmpeg_args += [
	    '-attach', thumbnail_filename,
	    '-metadata:s:t:0', 'filename=cover_land.jpg',
	    '-metadata:s:t:0', 'mimetype=image/jpeg',
	    '-metadata:s:t:0', 'title=Thumbnail',
	    '-attach', info_filename,
	    '-metadata:s:t:1', 'filename=youtube-dl.info.json',
	    '-metadata:s:t:1', 'mimetype=application/json',
	    '-metadata:s:t:1', 'title="Youtube-dl info file"',
	    out_filename
	  ]
	  ffmpeg_rc = execute_command(
	    'ffmpeg-4.3-amd64-static/ffmpeg',
	    ffmpeg_args
	  )
	  # Never delete the download unless the remux produced something: a truthy
	  # return value is treated as a failing exit status (None means no status was
	  # reported), and the output file must exist and be non-empty.
	  if ffmpeg_rc or not os.path.isfile(out_filename) or not os.path.getsize(out_filename):
	    print('>> ffmpeg did not produce "{}", keeping source files of {}'.format(out_filename, url))
	    continue
	  for source_file in (in_filename, thumbnail_filename, info_filename):
	    os.remove(source_file)


	'''
	track_name =
	./jq -r '.fulltitle | sub("\\s{2,}"; " ")'
	./jq -r '.fulltitle'
	./jq -r '.fulltitle | sub("United We Stream Festival\\s+((à|-|en|@)\\s+)?(?<session>.*)"; "\(.session)") | sub("\\s{2,}"; " ")')

	session_name =
	./jq -r '.fulltitle | sub(".*#\\s*[0-9]+\\s*(:|-)\\s*(?<session>.*)"; "\(.session)")
	./jq -r '.fulltitle | sub("United We Stream\\s+((à|-|en)\\s+)?(?<session>.*)"; "\(.session)")
	./jq -r '.fulltitle'
	./jq -r '.fulltitle | sub("United We Stream\\s+((à|-)\\s+)?(?<session>.*)"; "\(.session)")
	./jq -r '.fulltitle | sub("United We Stream Festival\\s+((à|-|en|@)\\s+)?(?<session>.*)"; "\(.session)")
	./jq -r '.fulltitle | sub("United We Stream\\s+((à|-|en)\\s+)?(?<session>.*)"; "\(.session)")'
	'''