Skip to content

Instantly share code, notes, and snippets.

@oozoofrog
Last active June 26, 2024 17:34
Show Gist options
  • Save oozoofrog/02aedba77bb9403731bd28be6ccdb39f to your computer and use it in GitHub Desktop.
Save oozoofrog/02aedba77bb9403731bd28be6ccdb39f to your computer and use it in GitHub Desktop.
WWDC 2024 script: download a session video and its subtitles, translate the subtitles, and merge them into the video
import sys
import requests
import yt_dlp
import openai
def get_best_format(m3u8_url):
    """Return the ``format_id`` of the best-quality format reported by yt-dlp.

    Formats are ranked by height first, then FPS, then total bitrate (``tbr``).

    Args:
        m3u8_url: URL handed to yt-dlp for format extraction.

    Returns:
        The ``format_id`` of the highest-ranked format.

    Raises:
        ValueError: If yt-dlp reports no formats for the URL.
    """
    ydl_opts = {
        'listformats': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(m3u8_url, download=False)
    formats = (result or {}).get('formats') or []
    if not formats:
        raise ValueError(f"No formats found for: {m3u8_url}")

    def quality(fmt):
        # Treat a missing key and an explicit None the same way: a None in
        # the tuple would make max() raise TypeError when compared to a number.
        return tuple(fmt.get(key) or 0 for key in ('height', 'fps', 'tbr'))

    return max(formats, key=quality)['format_id']
def get_webvtt_format(m3u8_url):
    """Return the ``format_id`` of the first format yt-dlp reports for the URL."""
    with yt_dlp.YoutubeDL({'listformats': True}) as downloader:
        info = downloader.extract_info(m3u8_url, download=False)
    available = info.get('formats', [])
    first_format = available[0]
    return first_format['format_id']
def fetch_informations(wwdc_url):
    """Fetch a WWDC session page and derive the video and subtitle URLs.

    Args:
        wwdc_url: URL of the WWDC session page.

    Returns:
        A tuple ``(content_id, content_uuid, title, video_url, subtitle_url)``.
        ``content_id`` is the last path segment of ``wwdc_url``, ``title`` is
        the text of the page's ``<title>`` tag (empty if there is none), and
        ``subtitle_url`` is built from the part of ``video_url`` that precedes
        ``downloads``.

    Raises:
        Exception: If the page does not return HTTP 200 or contains no
            ``hd.mp4`` link.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(wwdc_url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch WWDC page: {response.status_code}")
    content = response.text

    # The content id is the last path segment; ignore a trailing slash so
    # ".../10118/" still yields "10118" instead of an empty string.
    content_id = wwdc_url.rstrip('/').split('/')[-1]

    title = ""
    if '<title>' in content:
        title = content.split('<title>')[1].split('</title>')[0]

    # Look for the exact marker that is split on below. Checking only for
    # ".mp4" would build a garbage URL from pages that lack an HD link.
    marker = 'hd.mp4'
    if marker not in content:
        raise Exception("Could not find video in the page content")
    # The URL is the text between the last double quote before the marker
    # and the marker itself.
    video_url = content.split(marker)[0].split('"')[-1] + marker

    # video url sample:
    #   https://devstreaming-cdn.apple.com/videos/wwdc/2024/10171/4/7E1A626A-DE4F-4DEB-A2D9-ECCAAD10A34F/downloads/wwdc2024-10171_hd.mp4
    # subtitle url sample:
    #   https://devstreaming-cdn.apple.com/videos/wwdc/2024/10118/4/16FC914B-F442-41A4-AFF4-5047A3FF7125/subtitles/eng/prog_index.m3u8
    url_parts = video_url.split('/')
    content_uuid = url_parts[-3] if len(url_parts) >= 3 else ""
    subtitle_url = video_url.split('downloads')[0] + 'subtitles/eng/prog_index.m3u8'
    return content_id, content_uuid, title, video_url, subtitle_url
def download_video(video_url, output_filename):
    """Download the video with yt-dlp and return the path of the saved file.

    The output template is ``<output_filename>.<ext>``; the 'best' format is
    requested and the ``overwrites`` option is enabled.
    """
    options = dict(
        format='best',
        outtmpl=f'{output_filename}.%(ext)s',
        overwrites=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        # extract_info with download=True performs the download and reports
        # where the file was written.
        info = downloader.extract_info(video_url, download=True)
        saved_path = info['requested_downloads'][0]['filepath']
        print(f"Downloaded video to: {saved_path}")
        return saved_path
def download_subtitle(m3u8_url, output_filename):
    """Run ffmpeg to save the subtitle playlist as ``<output_filename>_en.srt``.

    Returns the output file name joined onto the current working directory.
    Raises ``subprocess.CalledProcessError`` if ffmpeg exits with an error.
    """
    import os
    import subprocess

    srt_name = output_filename + "_en.srt"
    srt_path = os.path.join(os.getcwd(), srt_name)
    # '-y' forces ffmpeg to overwrite an existing output file.
    ffmpeg_args = ['ffmpeg', '-i', m3u8_url, srt_name, '-y']
    subprocess.run(ffmpeg_args, check=True)
    print(f"Downloaded subtitle to: {srt_path}")
    return srt_path
def download_video_and_subtitle(opts, video_url, subtitle_url, output_filename):
    """Download the video and/or the subtitle, depending on ``opts``.

    ``"--video"`` downloads only the video and ``"--subtitle"`` only the
    subtitle; any other value (including ``None``) downloads both, video first.

    Returns a ``(video_path, subtitle_path)`` tuple in which a skipped
    download is ``None``.
    """
    only_video = opts == "--video"
    only_subtitle = opts == "--subtitle"

    video_path = None
    if not only_subtitle:
        video_path = download_video(video_url, output_filename)

    subtitle_path = None
    if not only_video:
        subtitle_path = download_subtitle(subtitle_url, output_filename)

    return video_path, subtitle_path
def merge_video_subtitle(video_file, subtitle_file, subtitle_ko_file):
    """Mux the subtitle file(s) into the video as ``mov_text`` subtitle tracks.

    The result is written as ``<video path without extension>_subtitled.mp4``;
    video and audio streams are copied without re-encoding.

    Args:
        video_file: Path of the downloaded video.
        subtitle_file: Path of the English SRT file (tagged ``eng``).
        subtitle_ko_file: Path of the Korean SRT file (tagged ``kor``), or a
            falsy value to embed only the English track.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    import os
    import subprocess

    # splitext removes only the final extension, so dots elsewhere in the
    # path (directory names, titles such as "swift_6.0") are preserved.
    output_file = os.path.splitext(video_file)[0] + '_subtitled.mp4'

    subtitle_tracks = [(subtitle_file, 'eng')]
    if subtitle_ko_file:
        subtitle_tracks.append((subtitle_ko_file, 'kor'))

    # '-y' overwrites an existing output in both cases instead of letting
    # ffmpeg stop at an interactive prompt.
    command = ['ffmpeg', '-y', '-i', video_file]
    for path, _language in subtitle_tracks:
        command += ['-i', path]

    # Map streams explicitly: without -map, ffmpeg's automatic selection keeps
    # only one subtitle stream, so a second subtitle input would be dropped.
    command += ['-map', '0:v', '-map', '0:a?']
    for input_index in range(1, len(subtitle_tracks) + 1):
        command += ['-map', str(input_index)]

    command += [
        '-c:v', 'copy',
        '-c:a', 'copy',
        '-c:s', 'mov_text',  # Use mov_text codec for subtitles
    ]
    for track_index, (_path, language) in enumerate(subtitle_tracks):
        command += [f'-metadata:s:s:{track_index}', f'language={language}']
    command.append(output_file)

    subprocess.run(command, check=True)
    print(f"Subtitled video saved to: {output_file}")
def _strip_code_fences(content):
    """Remove Markdown code fences from a model reply and trim every line."""
    # Remove the longer marker first; stripping '```' first would leave a
    # stray "srt" line behind.
    cleaned = (content or "").replace('```srt', '').replace('```', '')
    return '\n'.join(line.strip() for line in cleaned.strip().split('\n'))


def translation_subtitle_to_ko(openai_key, subtitle_path, batch_size=10):
    """Translate an English SRT file to Korean in batches via OpenAI.

    Args:
        openai_key: OpenAI API key. When empty or ``None`` the translation is
            skipped.
        subtitle_path: Path of the English SRT file. The output path is the
            same path with ``_en.srt`` replaced by ``_ko.srt``.
        batch_size: Number of subtitle entries sent per request.

    Returns:
        The path of the translated SRT file, or ``""`` when no key was given.
    """
    # An invalid key is only announced; it is not treated as an error.
    if not openai_key:
        print("OpenAI key is not valid. ignore translation.")
        return ""

    # Imported after the key check so the skip path does not need the client.
    from openai import OpenAI

    translated_subtitle_path = subtitle_path.replace('_en.srt', '_ko.srt')
    with open(subtitle_path, encoding='utf-8') as file:
        structured_subtitles = parse_srt(file.read())

    client = OpenAI(api_key=openai_key)
    total = len(structured_subtitles)
    translated_chunks = []
    # Slicing covers the final partial batch too, so every entry is sent
    # for translation exactly once.
    for start in range(0, total, batch_size):
        batch = structured_subtitles[start:start + batch_size]
        content = translation_subtitle_to_openai(client, batch)
        translated_chunks.append(_strip_code_fences(content) + "\n\n")
        print(f"Translated {start + len(batch)}/{total} subtitles")

    with open(translated_subtitle_path, 'w', encoding='utf-8') as file:
        file.write(''.join(translated_chunks))
    return translated_subtitle_path
def translation_subtitle_to_openai(client, subtitles):
    """Ask the chat model to translate a batch of subtitle entries to Korean.

    ``subtitles`` is a list of dicts with ``index``, ``start_time``,
    ``end_time`` and ``text`` keys. They are appended to the instruction text
    in SRT layout and sent as a single user message.

    Returns the message content of the first choice in the completion.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates text from English to Korean.",
    }
    instructions = (
        '\n'
        'Translate the following subtitles to Korean. Please write the technical terms or names in both Korean and English. '
        'Please ensure that unnecessary characters, such as "`" or strings like `srt`, are not added outside of the subtitle format.'
        '\n'
    )
    entries = [
        f'\n{entry["index"]}\n{entry["start_time"]} --> {entry["end_time"]}\n{entry["text"]}\n'
        for entry in subtitles
    ]
    user_message = {"role": "user", "content": instructions + ''.join(entries)}

    completion = client.chat.completions.create(
        messages=[system_message, user_message],
        model="gpt-4o",
    )
    return completion.choices[0].message.content
def parse_srt(srt):
    """Parse SRT text into a list of subtitle dictionaries.

    Entries are separated by one or more blank lines. Windows (``\\r\\n``) and
    old Mac (``\\r``) line endings and a leading byte-order mark are accepted.

    Args:
        srt: The full SRT document as a string.

    Returns:
        A list of dicts with the keys ``index`` (int), ``start_time``,
        ``end_time`` and ``text`` (text lines joined with ``\\n``). Empty
        input yields an empty list.

    Raises:
        ValueError: If an entry lacks a timing line, its index is not an
            integer, or its timing line has no ``' --> '`` separator.
    """
    import re

    # Normalize first so the blank-line split below sees plain '\n' only.
    text = srt.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n').strip()
    if not text:
        return []

    structured_data = []
    # '\n\s*\n' also swallows runs of blank or whitespace-only lines, which
    # would otherwise produce empty entries.
    for entry in re.split(r'\n\s*\n', text):
        lines = entry.split('\n')
        if len(lines) < 2:
            raise ValueError(f"Malformed SRT entry (missing timing line): {entry!r}")
        start_time, separator, end_time = lines[1].partition(' --> ')
        if not separator:
            raise ValueError(f"Malformed SRT timing line: {lines[1]!r}")
        structured_data.append({
            'index': int(lines[0]),
            'start_time': start_time,
            'end_time': end_time,
            'text': '\n'.join(lines[2:]),
        })
    return structured_data
# opts is nullable
def main(wwdc_url, opts, openai_key):
    """Download, translate and merge a WWDC session's video and subtitles.

    Args:
        wwdc_url: URL of the WWDC session page.
        opts: ``"--video"``, ``"--subtitle"`` or ``None`` (download both).
        openai_key: OpenAI API key passed to the subtitle translation step.

    Any exception raised along the way is caught and printed as
    ``Error: ...``; nothing is re-raised.
    """
    try:
        content_id, content_uuid, title, video_url, subtitle_url = fetch_informations(wwdc_url)
        print(f"Content ID: {content_id}")
        print(f"Content UUID: {content_uuid}")
        print(f"Title: {title}")
        print(f"Video URL: {video_url}")
        print(f"Subtitle URL: {subtitle_url}")

        # The title becomes part of a file name: replace path separators so
        # a title containing "/" cannot point into a non-existent directory.
        safe_title = title.lower().replace(" ", "_").replace("/", "_").replace("\\", "_")
        filename = f'{content_id}_{safe_title}'

        video_path, subtitle_path = download_video_and_subtitle(opts, video_url, subtitle_url, filename)

        subtitle_ko_path = None
        if subtitle_path:
            # translation with openai
            subtitle_ko_path = translation_subtitle_to_ko(openai_key, subtitle_path)

        # Merging needs both the video and the subtitle on disk.
        if video_path and subtitle_path:
            merge_video_subtitle(video_path, subtitle_path, subtitle_ko_path)
    except Exception as e:
        print(f"Error: {e}")
if __name__ == "__main__":
    # Command-line interface:
    #   wwdc_url                     WWDC session page URL
    #   --video / --subtitle         download only the video or only the subtitle
    #   --openai_key YOUR_OPENAI_KEY key used for the subtitle translation
    import argparse

    cli = argparse.ArgumentParser(description='Download WWDC video and subtitle')
    cli.add_argument('wwdc_url', type=str, help='WWDC video URL')
    cli.add_argument('--video', action='store_true', help='Download video only')
    cli.add_argument('--subtitle', action='store_true', help='Download subtitle only')
    cli.add_argument('--openai_key', type=str, help='OpenAI API key')
    parsed = cli.parse_args()

    # --video takes precedence when both flags are given.
    opts = "--video" if parsed.video else ("--subtitle" if parsed.subtitle else None)
    main(parsed.wwdc_url, opts, parsed.openai_key)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment