# oozoofrog/wwdc2024.py

## wwdc2024.py
import sys
import requests
import yt_dlp
import openai

def get_best_format(m3u8_url):
    """Return the ``format_id`` of the highest-ranked format for a stream.

    Formats are ranked by ``height``, then ``fps``, then ``tbr``. Only
    metadata is extracted; nothing is downloaded.

    Args:
        m3u8_url: URL passed to ``yt_dlp`` for extraction.

    Returns:
        The ``format_id`` value of the top-ranked format.

    Raises:
        ValueError: If no formats are reported for the URL.
    """
    ydl_opts = {
        'listformats': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(m3u8_url, download=False)

    formats = (result or {}).get('formats') or []
    if not formats:
        raise ValueError(f"No formats found for: {m3u8_url}")

    def rank(fmt):
        # A key may be present with a None value, which dict.get's default
        # does not replace; comparing None with a number raises TypeError,
        # so missing and None are both treated as 0.
        return tuple(fmt.get(field) or 0 for field in ('height', 'fps', 'tbr'))

    return max(formats, key=rank)['format_id']

def get_webvtt_format(m3u8_url):
    """Return the ``format_id`` of the first format listed for ``m3u8_url``.

    Only metadata is extracted; nothing is downloaded. An empty format list
    results in ``IndexError``.
    """
    with yt_dlp.YoutubeDL({'listformats': True}) as downloader:
        info = downloader.extract_info(m3u8_url, download=False)
        available = info.get('formats', [])

    first = available[0]
    return first['format_id']

def fetch_informations(wwdc_url):
    """Fetch a WWDC session page and derive its video and subtitle URLs.

    Args:
        wwdc_url: URL of the session page. Its last non-empty path segment
            is used as the content id.

    Returns:
        A ``(content_id, content_uuid, title, video_url, subtitle_url)``
        tuple of strings. ``title`` is empty when the page has no
        ``<title>`` element.

    Raises:
        RuntimeError: If the page does not answer with HTTP 200 or contains
            no ``hd.mp4`` link.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(wwdc_url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch WWDC page: {response.status_code}")
    content = response.text

    # Ignore any query/fragment and trailing slash so that a URL ending in
    # ".../10118/" still yields "10118" instead of an empty id.
    path = wwdc_url.split('#', 1)[0].split('?', 1)[0]
    content_id = path.rstrip('/').split('/')[-1]

    title = ""
    if '<title>' in content:
        title = content.split('<title>')[1].split('</title>')[0].strip()

    # The video link is the quoted string that ends at the first "hd.mp4".
    # Testing for "hd.mp4" itself (not just ".mp4") avoids building a bogus
    # URL from unrelated page text when only other mp4 links are present.
    before, marker, _ = content.partition('hd.mp4')
    if not marker:
        raise RuntimeError("Could not find video in the page content")
    video_url = before.rsplit('"', 1)[-1] + marker

    # subtitle url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10118/4/16FC914B-F442-41A4-AFF4-5047A3FF7125/subtitles/eng/prog_index.m3u8
    # video url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10171/4/7E1A626A-DE4F-4DEB-A2D9-ECCAAD10A34F/downloads/wwdc2024-10171_hd.mp4
    segments = video_url.split('/')
    content_uuid = segments[-3] if len(segments) >= 3 else ""
    subtitle_url = video_url.split('downloads')[0] + 'subtitles/eng/prog_index.m3u8'

    return content_id, content_uuid, title, video_url, subtitle_url

def download_video(video_url, output_filename):
    """Download ``video_url`` with yt-dlp and return the saved file's path.

    The output template is ``<output_filename>.<ext>``; the ``best`` format
    selector is used and the ``overwrites`` option is enabled.
    """
    options = dict(
        format='best',
        outtmpl=f'{output_filename}.%(ext)s',
        overwrites=True,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        # extract_info with download=True performs the download and returns
        # the metadata, which records where the file was written.
        info = downloader.extract_info(video_url, download=True)
        first_download = info['requested_downloads'][0]
        saved_path = first_download['filepath']
        print(f"Downloaded video to: {saved_path}")
        return saved_path

def download_subtitle(m3u8_url, output_filename):
    """Convert the subtitle playlist at ``m3u8_url`` to an SRT file via ffmpeg.

    ffmpeg writes ``<output_filename>_en.srt`` relative to the current
    working directory. Returns that name joined onto the working directory.
    ``subprocess.CalledProcessError`` is raised if ffmpeg exits non-zero.
    """
    import os
    import subprocess

    srt_name = output_filename + "_en.srt"
    srt_path = os.path.join(os.getcwd(), srt_name)

    # The trailing '-y' lets ffmpeg overwrite an existing file.
    ffmpeg_args = ['ffmpeg', '-i', m3u8_url, srt_name, '-y']
    subprocess.run(ffmpeg_args, check=True)

    print(f"Downloaded subtitle to: {srt_path}")
    return srt_path

def download_video_and_subtitle(opts, video_url, subtitle_url, output_filename):
    """Download the video, the subtitle, or both, depending on ``opts``.

    ``"--video"`` fetches only the video and ``"--subtitle"`` only the
    subtitle; any other value (including ``None``) fetches both, video first.

    Returns:
        A ``(video_path, subtitle_path)`` tuple; the entry for a skipped
        download is ``None``.
    """
    video_only = opts == "--video"
    subtitle_only = opts == "--subtitle"

    video_path = None
    if not subtitle_only:
        video_path = download_video(video_url, output_filename)

    subtitle_path = None
    if not video_only:
        subtitle_path = download_subtitle(subtitle_url, output_filename)

    return video_path, subtitle_path

def merge_video_subtitle(video_file, subtitle_file, subtitle_ko_file):
    """Mux subtitle tracks into a copy of ``video_file`` using ffmpeg.

    Video and audio are stream-copied; subtitles are encoded as ``mov_text``.

    Args:
        video_file: Path of the source video.
        subtitle_file: Path of the English SRT file (tagged ``eng``).
        subtitle_ko_file: Path of the Korean SRT file (tagged ``kor``), or a
            falsy value to embed only the English track.

    Returns:
        Path of the written file: ``video_file`` with its extension replaced
        by ``_subtitled.mp4``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import os
    import subprocess

    # splitext removes only the final extension; cutting at the first '.'
    # would truncate paths such as "/home/a.b/10118_swiftui_2.0.mp4".
    output_file = os.path.splitext(video_file)[0] + '_subtitled.mp4'

    subtitle_tracks = [(subtitle_file, 'eng')]
    if subtitle_ko_file:
        subtitle_tracks.append((subtitle_ko_file, 'kor'))

    # '-y' is given up front so every variant overwrites without prompting.
    command = ['ffmpeg', '-y', '-i', video_file]
    for path, _ in subtitle_tracks:
        command += ['-i', path]

    # Without explicit -map options ffmpeg selects a single subtitle stream
    # across all inputs, so a second subtitle file would be dropped.
    # '0:a?' keeps the command valid for a video without audio.
    command += ['-map', '0:v', '-map', '0:a?']
    for input_index in range(1, len(subtitle_tracks) + 1):
        command += ['-map', f'{input_index}:s']

    command += ['-c:v', 'copy', '-c:a', 'copy', '-c:s', 'mov_text']
    for stream_index, (_, language) in enumerate(subtitle_tracks):
        command += [f'-metadata:s:s:{stream_index}', f'language={language}']
    command.append(output_file)

    subprocess.run(command, check=True)
    print(f"Subtitled video saved to: {output_file}")
    return output_file

def translation_subtitle_to_ko(openai_key, subtitle_path, batch_size=10):
    """Translate an English SRT file to Korean and write the result to disk.

    Cues are sent to ``translation_subtitle_to_openai`` in groups of
    ``batch_size``; each cleaned response is written followed by a blank line.

    Args:
        openai_key: OpenAI API key. When falsy, translation is skipped.
        subtitle_path: Path of the source file. ``_en.srt`` in it is replaced
            by ``_ko.srt`` to form the output path.
        batch_size: Number of cues per request (default 10).

    Returns:
        Path of the translated file, or ``""`` when translation was skipped.
    """
    if not openai_key:
        print("OpenAI key is not valid. ignore translation.")
        return ""

    # Imported only once a key is known, so skipping translation does not
    # require the openai package to be installed.
    from openai import OpenAI

    translated_subtitle_path = subtitle_path.replace('_en.srt', '_ko.srt')
    if translated_subtitle_path == subtitle_path:
        # The source name lacks the expected suffix; never overwrite it.
        translated_subtitle_path = subtitle_path + '_ko.srt'

    with open(subtitle_path, encoding='utf-8') as file:
        structured_subtitles = parse_srt(file.read())

    client = OpenAI(api_key=openai_key)
    total = len(structured_subtitles)
    step = max(1, batch_size)
    translated_parts = []
    for start in range(0, total, step):
        batch = structured_subtitles[start:start + step]
        # Every batch gets its own request, including the short final one.
        content = translation_subtitle_to_openai(client, batch) or ""
        translated_parts.append(_clean_translated_block(content))
        print(f"Translated {start + len(batch)}/{total} subtitles")

    with open(translated_subtitle_path, 'w', encoding='utf-8') as file:
        file.write(''.join(part + "\n\n" for part in translated_parts))

    return translated_subtitle_path


def _clean_translated_block(content):
    """Strip Markdown code fences and per-line indentation from a response."""
    # Remove the longer marker first; removing bare ``` first would leave a
    # stray "srt" behind.
    for fence in ('```srt', '```'):
        content = content.replace(fence, '')
    return '\n'.join(line.strip() for line in content.split('\n')).strip()

def translation_subtitle_to_openai(client, subtitles, model="gpt-4o"):
    """Ask the chat-completions API to translate a batch of cues to Korean.

    Args:
        client: Object exposing ``chat.completions.create(messages=...,
            model=...)``.
        subtitles: Iterable of dicts with ``index``, ``start_time``,
            ``end_time`` and ``text`` keys.
        model: Model name passed to the API (default ``"gpt-4o"``).

    Returns:
        The text of the first choice, or ``""`` when the response has no
        choices or no text content.
    """
    instruction = (
        'Translate the following subtitles to Korean. Please write the '
        'technical terms or names in both Korean and English. Please ensure '
        'that unnecessary characters, such as "`" or strings like `srt`, are '
        'not added outside of the subtitle format.'
    )
    # Cues are rendered flush-left in SRT layout so the request carries no
    # source-code indentation.
    cues = [
        f'{subtitle["index"]}\n'
        f'{subtitle["start_time"]} --> {subtitle["end_time"]}\n'
        f'{subtitle["text"]}'
        for subtitle in subtitles
    ]
    content = instruction + "\n\n" + "\n\n".join(cues) + "\n"

    messages = [
        {"role": "system", "content": "You are a helpful assistant that translates text from English to Korean."},
        {"role": "user", "content": content},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    choices = chat_completion.choices
    if not choices:
        return ""
    # The message content may be None; normalize to an empty string so
    # callers can treat the result as text.
    return choices[0].message.content or ""

def parse_srt(srt):
    """Parse SRT text into a list of cue dictionaries.

    Cues are separated by one or more blank lines. A leading byte-order mark
    and CRLF/CR line endings are tolerated.

    Args:
        srt: Full text of an SRT file.

    Returns:
        A list of dicts with ``index`` (int), ``start_time``, ``end_time``
        and ``text`` (remaining lines joined with newlines). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If a cue lacks a numeric index or a ``-->`` timing line.
    """
    normalized = srt.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    structured_data = []
    block = []
    # The extra trailing blank line flushes the final cue.
    for line in normalized.split('\n') + ['']:
        if line.strip():
            block.append(line)
            continue
        if not block:
            continue  # run of consecutive blank lines
        structured_data.append(_parse_srt_block(block))
        block = []

    return structured_data


def _parse_srt_block(lines):
    """Convert the non-blank lines of one SRT cue into a dictionary."""
    if len(lines) < 2:
        raise ValueError(f"Malformed SRT entry: {lines!r}")
    try:
        index = int(lines[0].strip())
    except ValueError:
        raise ValueError(f"Invalid SRT index: {lines[0]!r}") from None
    start_time, arrow, end_time = lines[1].partition('-->')
    if not arrow:
        raise ValueError(f"Invalid SRT timing line: {lines[1]!r}")
    return {
        'index': index,
        'start_time': start_time.strip(),
        'end_time': end_time.strip(),
        'text': '\n'.join(lines[2:]),
    }

# opts is nullable
def main(wwdc_url, opts, openai_key):
    """Download a WWDC session, translate its subtitles and mux them in.

    Args:
        wwdc_url: URL of the WWDC session page.
        opts: ``"--video"`` or ``"--subtitle"`` to download only that part;
            ``None`` downloads both.
        openai_key: OpenAI API key used for the Korean translation; may be
            ``None``.

    Returns:
        ``0`` on success, ``1`` if any step raised an exception.
    """
    import html
    import re
    import sys

    try:
        content_id, content_uuid, title, video_url, subtitle_url = fetch_informations(wwdc_url)
        print(f"Content ID: {content_id}")
        print(f"Content UUID: {content_uuid}")
        print(f"Title: {title}")
        print(f"Video URL: {video_url}")
        print(f"Subtitle URL: {subtitle_url}")

        # The page title is arbitrary text: path separators, dots, '%' (which
        # is special in the yt-dlp output template) and HTML entities would
        # produce unusable file names, so keep only word characters and '-'.
        slug = re.sub(r'[^\w\-]+', '_', html.unescape(title).lower()).strip('_')
        filename = f'{content_id}_{slug}'

        video_path, subtitle_path = download_video_and_subtitle(opts, video_url, subtitle_url, filename)
        subtitle_ko_path = None
        if subtitle_path:
            # translation with openai
            subtitle_ko_path = translation_subtitle_to_ko(openai_key, subtitle_path)
        # Merging needs both the video and the subtitle on disk.
        if video_path and subtitle_path:
            merge_video_subtitle(video_path, subtitle_path, subtitle_ko_path)
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure to the
        # caller instead of silently succeeding.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0

if __name__ == "__main__":
    # Command-line interface:
    #   wwdc_url               WWDC session page URL
    #   --video | --subtitle   download only the video or only the subtitle
    #   --openai_key=KEY       OpenAI API key
    import argparse

    parser = argparse.ArgumentParser(description='Download WWDC video and subtitle')
    parser.add_argument('wwdc_url', type=str, help='WWDC video URL')
    # The two "only" switches contradict each other, so argparse rejects the
    # combination instead of one silently winning.
    only_group = parser.add_mutually_exclusive_group()
    only_group.add_argument('--video', action='store_true', help='Download video only')
    only_group.add_argument('--subtitle', action='store_true', help='Download subtitle only')
    parser.add_argument('--openai_key', type=str, help='OpenAI API key')
    args = parser.parse_args()

    if args.video:
        opts = "--video"
    elif args.subtitle:
        opts = "--subtitle"
    else:
        opts = None
    # Forward main()'s return value as the process exit status (None -> 0).
    sys.exit(main(args.wwdc_url, opts, args.openai_key))
	import sys
	import requests
	import yt_dlp
	import openai

	def get_best_format(m3u8_url):
	    """Return the ``format_id`` of the highest-ranked format for a stream.

	    Formats are ranked by ``height``, then ``fps``, then ``tbr``. Only
	    metadata is extracted; nothing is downloaded.

	    Args:
	        m3u8_url: URL passed to ``yt_dlp`` for extraction.

	    Returns:
	        The ``format_id`` value of the top-ranked format.

	    Raises:
	        ValueError: If no formats are reported for the URL.
	    """
	    ydl_opts = {
	        'listformats': True,
	    }

	    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	        result = ydl.extract_info(m3u8_url, download=False)

	    formats = (result or {}).get('formats') or []
	    if not formats:
	        raise ValueError(f"No formats found for: {m3u8_url}")

	    def rank(fmt):
	        # A key may be present with a None value, which dict.get's default
	        # does not replace; comparing None with a number raises TypeError,
	        # so missing and None are both treated as 0.
	        return tuple(fmt.get(field) or 0 for field in ('height', 'fps', 'tbr'))

	    return max(formats, key=rank)['format_id']

	def get_webvtt_format(m3u8_url):
	    """Return the ``format_id`` of the first format listed for ``m3u8_url``.

	    Only metadata is extracted; nothing is downloaded. An empty format list
	    results in ``IndexError``.
	    """
	    with yt_dlp.YoutubeDL({'listformats': True}) as downloader:
	        info = downloader.extract_info(m3u8_url, download=False)
	        available = info.get('formats', [])

	    first = available[0]
	    return first['format_id']

	def fetch_informations(wwdc_url):
	    """Fetch a WWDC session page and derive its video and subtitle URLs.

	    Args:
	        wwdc_url: URL of the session page. Its last non-empty path segment
	            is used as the content id.

	    Returns:
	        A ``(content_id, content_uuid, title, video_url, subtitle_url)``
	        tuple of strings. ``title`` is empty when the page has no
	        ``<title>`` element.

	    Raises:
	        RuntimeError: If the page does not answer with HTTP 200 or contains
	            no ``hd.mp4`` link.
	        requests.RequestException: On connection failure or timeout.
	    """
	    # Without a timeout, requests waits indefinitely on a stalled connection.
	    response = requests.get(wwdc_url, timeout=30)
	    if response.status_code != 200:
	        raise RuntimeError(f"Failed to fetch WWDC page: {response.status_code}")
	    content = response.text

	    # Ignore any query/fragment and trailing slash so that a URL ending in
	    # ".../10118/" still yields "10118" instead of an empty id.
	    path = wwdc_url.split('#', 1)[0].split('?', 1)[0]
	    content_id = path.rstrip('/').split('/')[-1]

	    title = ""
	    if '<title>' in content:
	        title = content.split('<title>')[1].split('</title>')[0].strip()

	    # The video link is the quoted string that ends at the first "hd.mp4".
	    # Testing for "hd.mp4" itself (not just ".mp4") avoids building a bogus
	    # URL from unrelated page text when only other mp4 links are present.
	    before, marker, _ = content.partition('hd.mp4')
	    if not marker:
	        raise RuntimeError("Could not find video in the page content")
	    video_url = before.rsplit('"', 1)[-1] + marker

	    # subtitle url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10118/4/16FC914B-F442-41A4-AFF4-5047A3FF7125/subtitles/eng/prog_index.m3u8
	    # video url sample https://devstreaming-cdn.apple.com/videos/wwdc/2024/10171/4/7E1A626A-DE4F-4DEB-A2D9-ECCAAD10A34F/downloads/wwdc2024-10171_hd.mp4
	    segments = video_url.split('/')
	    content_uuid = segments[-3] if len(segments) >= 3 else ""
	    subtitle_url = video_url.split('downloads')[0] + 'subtitles/eng/prog_index.m3u8'

	    return content_id, content_uuid, title, video_url, subtitle_url

	def download_video(video_url, output_filename):
	    """Download ``video_url`` with yt-dlp and return the saved file's path.

	    The output template is ``<output_filename>.<ext>``; the ``best`` format
	    selector is used and the ``overwrites`` option is enabled.
	    """
	    options = dict(
	        format='best',
	        outtmpl=f'{output_filename}.%(ext)s',
	        overwrites=True,
	    )

	    with yt_dlp.YoutubeDL(options) as downloader:
	        # extract_info with download=True performs the download and returns
	        # the metadata, which records where the file was written.
	        info = downloader.extract_info(video_url, download=True)
	        first_download = info['requested_downloads'][0]
	        saved_path = first_download['filepath']
	        print(f"Downloaded video to: {saved_path}")
	        return saved_path

	def download_subtitle(m3u8_url, output_filename):
	    """Convert the subtitle playlist at ``m3u8_url`` to an SRT file via ffmpeg.

	    ffmpeg writes ``<output_filename>_en.srt`` relative to the current
	    working directory. Returns that name joined onto the working directory.
	    ``subprocess.CalledProcessError`` is raised if ffmpeg exits non-zero.
	    """
	    import os
	    import subprocess

	    srt_name = output_filename + "_en.srt"
	    srt_path = os.path.join(os.getcwd(), srt_name)

	    # The trailing '-y' lets ffmpeg overwrite an existing file.
	    ffmpeg_args = ['ffmpeg', '-i', m3u8_url, srt_name, '-y']
	    subprocess.run(ffmpeg_args, check=True)

	    print(f"Downloaded subtitle to: {srt_path}")
	    return srt_path

	def download_video_and_subtitle(opts, video_url, subtitle_url, output_filename):
	    """Download the video, the subtitle, or both, depending on ``opts``.

	    ``"--video"`` fetches only the video and ``"--subtitle"`` only the
	    subtitle; any other value (including ``None``) fetches both, video first.

	    Returns:
	        A ``(video_path, subtitle_path)`` tuple; the entry for a skipped
	        download is ``None``.
	    """
	    video_only = opts == "--video"
	    subtitle_only = opts == "--subtitle"

	    video_path = None
	    if not subtitle_only:
	        video_path = download_video(video_url, output_filename)

	    subtitle_path = None
	    if not video_only:
	        subtitle_path = download_subtitle(subtitle_url, output_filename)

	    return video_path, subtitle_path

	def merge_video_subtitle(video_file, subtitle_file, subtitle_ko_file):
	    """Mux subtitle tracks into a copy of ``video_file`` using ffmpeg.

	    Video and audio are stream-copied; subtitles are encoded as ``mov_text``.

	    Args:
	        video_file: Path of the source video.
	        subtitle_file: Path of the English SRT file (tagged ``eng``).
	        subtitle_ko_file: Path of the Korean SRT file (tagged ``kor``), or a
	            falsy value to embed only the English track.

	    Returns:
	        Path of the written file: ``video_file`` with its extension replaced
	        by ``_subtitled.mp4``.

	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
	    """
	    import os
	    import subprocess

	    # splitext removes only the final extension; cutting at the first '.'
	    # would truncate paths such as "/home/a.b/10118_swiftui_2.0.mp4".
	    output_file = os.path.splitext(video_file)[0] + '_subtitled.mp4'

	    subtitle_tracks = [(subtitle_file, 'eng')]
	    if subtitle_ko_file:
	        subtitle_tracks.append((subtitle_ko_file, 'kor'))

	    # '-y' is given up front so every variant overwrites without prompting.
	    command = ['ffmpeg', '-y', '-i', video_file]
	    for path, _ in subtitle_tracks:
	        command += ['-i', path]

	    # Without explicit -map options ffmpeg selects a single subtitle stream
	    # across all inputs, so a second subtitle file would be dropped.
	    # '0:a?' keeps the command valid for a video without audio.
	    command += ['-map', '0:v', '-map', '0:a?']
	    for input_index in range(1, len(subtitle_tracks) + 1):
	        command += ['-map', f'{input_index}:s']

	    command += ['-c:v', 'copy', '-c:a', 'copy', '-c:s', 'mov_text']
	    for stream_index, (_, language) in enumerate(subtitle_tracks):
	        command += [f'-metadata:s:s:{stream_index}', f'language={language}']
	    command.append(output_file)

	    subprocess.run(command, check=True)
	    print(f"Subtitled video saved to: {output_file}")
	    return output_file

	def translation_subtitle_to_ko(openai_key, subtitle_path, batch_size=10):
	    """Translate an English SRT file to Korean and write the result to disk.

	    Cues are sent to ``translation_subtitle_to_openai`` in groups of
	    ``batch_size``; each cleaned response is written followed by a blank line.

	    Args:
	        openai_key: OpenAI API key. When falsy, translation is skipped.
	        subtitle_path: Path of the source file. ``_en.srt`` in it is replaced
	            by ``_ko.srt`` to form the output path.
	        batch_size: Number of cues per request (default 10).

	    Returns:
	        Path of the translated file, or ``""`` when translation was skipped.
	    """
	    if not openai_key:
	        print("OpenAI key is not valid. ignore translation.")
	        return ""

	    # Imported only once a key is known, so skipping translation does not
	    # require the openai package to be installed.
	    from openai import OpenAI

	    translated_subtitle_path = subtitle_path.replace('_en.srt', '_ko.srt')
	    if translated_subtitle_path == subtitle_path:
	        # The source name lacks the expected suffix; never overwrite it.
	        translated_subtitle_path = subtitle_path + '_ko.srt'

	    with open(subtitle_path, encoding='utf-8') as file:
	        structured_subtitles = parse_srt(file.read())

	    client = OpenAI(api_key=openai_key)
	    total = len(structured_subtitles)
	    step = max(1, batch_size)
	    translated_parts = []
	    for start in range(0, total, step):
	        batch = structured_subtitles[start:start + step]
	        # Every batch gets its own request, including the short final one.
	        content = translation_subtitle_to_openai(client, batch) or ""
	        translated_parts.append(_clean_translated_block(content))
	        print(f"Translated {start + len(batch)}/{total} subtitles")

	    with open(translated_subtitle_path, 'w', encoding='utf-8') as file:
	        file.write(''.join(part + "\n\n" for part in translated_parts))

	    return translated_subtitle_path


	def _clean_translated_block(content):
	    """Strip Markdown code fences and per-line indentation from a response."""
	    # Remove the longer marker first; removing bare ``` first would leave a
	    # stray "srt" behind.
	    for fence in ('```srt', '```'):
	        content = content.replace(fence, '')
	    return '\n'.join(line.strip() for line in content.split('\n')).strip()

	def translation_subtitle_to_openai(client, subtitles, model="gpt-4o"):
	    """Ask the chat-completions API to translate a batch of cues to Korean.

	    Args:
	        client: Object exposing ``chat.completions.create(messages=...,
	            model=...)``.
	        subtitles: Iterable of dicts with ``index``, ``start_time``,
	            ``end_time`` and ``text`` keys.
	        model: Model name passed to the API (default ``"gpt-4o"``).

	    Returns:
	        The text of the first choice, or ``""`` when the response has no
	        choices or no text content.
	    """
	    instruction = (
	        'Translate the following subtitles to Korean. Please write the '
	        'technical terms or names in both Korean and English. Please ensure '
	        'that unnecessary characters, such as "`" or strings like `srt`, are '
	        'not added outside of the subtitle format.'
	    )
	    # Cues are rendered flush-left in SRT layout so the request carries no
	    # source-code indentation.
	    cues = [
	        f'{subtitle["index"]}\n'
	        f'{subtitle["start_time"]} --> {subtitle["end_time"]}\n'
	        f'{subtitle["text"]}'
	        for subtitle in subtitles
	    ]
	    content = instruction + "\n\n" + "\n\n".join(cues) + "\n"

	    messages = [
	        {"role": "system", "content": "You are a helpful assistant that translates text from English to Korean."},
	        {"role": "user", "content": content},
	    ]
	    chat_completion = client.chat.completions.create(
	        messages=messages,
	        model=model,
	    )
	    choices = chat_completion.choices
	    if not choices:
	        return ""
	    # The message content may be None; normalize to an empty string so
	    # callers can treat the result as text.
	    return choices[0].message.content or ""

	def parse_srt(srt):
	    """Parse SRT text into a list of cue dictionaries.

	    Cues are separated by one or more blank lines. A leading byte-order mark
	    and CRLF/CR line endings are tolerated.

	    Args:
	        srt: Full text of an SRT file.

	    Returns:
	        A list of dicts with ``index`` (int), ``start_time``, ``end_time``
	        and ``text`` (remaining lines joined with newlines). Empty or
	        whitespace-only input yields an empty list.

	    Raises:
	        ValueError: If a cue lacks a numeric index or a ``-->`` timing line.
	    """
	    normalized = srt.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

	    structured_data = []
	    block = []
	    # The extra trailing blank line flushes the final cue.
	    for line in normalized.split('\n') + ['']:
	        if line.strip():
	            block.append(line)
	            continue
	        if not block:
	            continue  # run of consecutive blank lines
	        structured_data.append(_parse_srt_block(block))
	        block = []

	    return structured_data


	def _parse_srt_block(lines):
	    """Convert the non-blank lines of one SRT cue into a dictionary."""
	    if len(lines) < 2:
	        raise ValueError(f"Malformed SRT entry: {lines!r}")
	    try:
	        index = int(lines[0].strip())
	    except ValueError:
	        raise ValueError(f"Invalid SRT index: {lines[0]!r}") from None
	    start_time, arrow, end_time = lines[1].partition('-->')
	    if not arrow:
	        raise ValueError(f"Invalid SRT timing line: {lines[1]!r}")
	    return {
	        'index': index,
	        'start_time': start_time.strip(),
	        'end_time': end_time.strip(),
	        'text': '\n'.join(lines[2:]),
	    }

	# opts is nullable
	def main(wwdc_url, opts, openai_key):
	    """Download a WWDC session, translate its subtitles and mux them in.

	    Args:
	        wwdc_url: URL of the WWDC session page.
	        opts: ``"--video"`` or ``"--subtitle"`` to download only that part;
	            ``None`` downloads both.
	        openai_key: OpenAI API key used for the Korean translation; may be
	            ``None``.

	    Returns:
	        ``0`` on success, ``1`` if any step raised an exception.
	    """
	    import html
	    import re
	    import sys

	    try:
	        content_id, content_uuid, title, video_url, subtitle_url = fetch_informations(wwdc_url)
	        print(f"Content ID: {content_id}")
	        print(f"Content UUID: {content_uuid}")
	        print(f"Title: {title}")
	        print(f"Video URL: {video_url}")
	        print(f"Subtitle URL: {subtitle_url}")

	        # The page title is arbitrary text: path separators, dots, '%' (which
	        # is special in the yt-dlp output template) and HTML entities would
	        # produce unusable file names, so keep only word characters and '-'.
	        slug = re.sub(r'[^\w\-]+', '_', html.unescape(title).lower()).strip('_')
	        filename = f'{content_id}_{slug}'

	        video_path, subtitle_path = download_video_and_subtitle(opts, video_url, subtitle_url, filename)
	        subtitle_ko_path = None
	        if subtitle_path:
	            # translation with openai
	            subtitle_ko_path = translation_subtitle_to_ko(openai_key, subtitle_path)
	        # Merging needs both the video and the subtitle on disk.
	        if video_path and subtitle_path:
	            merge_video_subtitle(video_path, subtitle_path, subtitle_ko_path)
	    except Exception as e:
	        # Top-level boundary: report on stderr and signal failure to the
	        # caller instead of silently succeeding.
	        print(f"Error: {e}", file=sys.stderr)
	        return 1
	    return 0

	if __name__ == "__main__":
	    # Command-line interface:
	    #   wwdc_url               WWDC session page URL
	    #   --video | --subtitle   download only the video or only the subtitle
	    #   --openai_key=KEY       OpenAI API key
	    import argparse

	    parser = argparse.ArgumentParser(description='Download WWDC video and subtitle')
	    parser.add_argument('wwdc_url', type=str, help='WWDC video URL')
	    # The two "only" switches contradict each other, so argparse rejects the
	    # combination instead of one silently winning.
	    only_group = parser.add_mutually_exclusive_group()
	    only_group.add_argument('--video', action='store_true', help='Download video only')
	    only_group.add_argument('--subtitle', action='store_true', help='Download subtitle only')
	    parser.add_argument('--openai_key', type=str, help='OpenAI API key')
	    args = parser.parse_args()

	    if args.video:
	        opts = "--video"
	    elif args.subtitle:
	        opts = "--subtitle"
	    else:
	        opts = None
	    # Forward main()'s return value as the process exit status (None -> 0).
	    sys.exit(main(args.wwdc_url, opts, args.openai_key))