Last active
June 18, 2023 21:58
-
-
Save rubyu/9079512bd8856cf9e910da2824709d80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a script for creating English subtitle files for movie files. | |
Note that all descendants of the given folder are processed, not just those directly under the folder. | |
Subtitles are transcribed from English audio tracks using the Whisper model, so the quality is almost the same as CC (Closed Caption). | |
The srt file generated by this script is automatically loaded by players such as MPC (Media Player Classic) and it's derivatives. | |
This script requires: | |
- NVIDIA graphics card with sufficient memory (12GB+ recommended) | |
- CUDA 11 | |
- cuDNN for CUDA 11 | |
- zlib | |
- ffmpeg and ffprobe | |
- https://github.com/jianfch/stable-ts/ (!! important !!) | |
Note: It's been reported that the original implementation of openai/whisper is sometimes out of sync with the timestamp, | |
but we can relax this problem to a practically acceptable level by using stable-ts instead. | |
Regarding NVIDIA CUDA, I used the following binaries to set up my environment: | |
- cuda_11.8.0_522.06_windows.exe (https://developer.nvidia.com/cuda-toolkit-archive) | |
- cudnn-windows-x86_64-8.9.2.26_cuda11-archive.zip (https://developer.nvidia.com/cudnn) | |
- zlib123dllx64.zip (http://www.winimage.com/zLibDll/zlib123dllx64.zip) | |
""" | |
import os | |
import subprocess | |
import json | |
import stable_whisper | |
# Whisper model loaded once at import time; 'large-v2' needs a GPU with
# substantial VRAM (the module docstring recommends 12GB+).
# NOTE(review): importing this module therefore requires CUDA and is slow —
# acceptable for a one-shot script, confirm before reusing as a library.
model = stable_whisper.load_model('large-v2')
# External tool commands; assumed to be resolvable on PATH.
ffprobe = 'ffprobe'
ffmpeg = 'ffmpeg'
def get_streams(filepath):
    """Return the list of stream descriptors for *filepath* via ffprobe.

    Each descriptor is a dict with 'index', 'codec_type', 'codec_name' and,
    when present, a 'tags' dict carrying the 'language' tag.

    Raises RuntimeError if ffprobe exits non-zero, and KeyError if the JSON
    output has no 'streams' entry.
    """
    command = [ffprobe,
               '-v', 'error',
               '-print_format', 'json',
               '-show_entries', 'stream=index,codec_type,codec_name:stream_tags=language',
               filepath]
    print(f'command: {command}')
    result = subprocess.run(command, capture_output=True, text=True)
    print(f'stdout: {result.stdout}')
    print(f'stderr: {result.stderr}')
    # Fail loudly here: on error ffprobe prints nothing to stdout, and
    # json.loads('') would otherwise raise a confusing JSONDecodeError.
    if result.returncode != 0:
        raise RuntimeError(f'ffprobe failed ({result.returncode}): {result.stderr}')
    data = json.loads(result.stdout)
    return data['streams']
def get_audio_streams(streams, target_langs=('eng',)):
    """Return the audio streams whose language tag is in *target_langs*.

    *streams* is the list returned by get_streams(). Streams without a
    'tags'/'language' entry are skipped instead of raising KeyError (which
    previously aborted processing of the whole file). The default was a
    mutable list; an immutable tuple is safer and membership-equivalent.
    """
    return [s for s in streams
            if s['codec_type'] == 'audio'
            and s.get('tags', {}).get('language') in target_langs]
def extract_audio_file(input_path, audio_stream, output_path):
    """Extract one audio stream of *input_path* to a 44.1 kHz 16-bit PCM WAV.

    *audio_stream* is one descriptor from get_audio_streams(); its 'index'
    selects the stream via ffmpeg's -map option.

    Raises RuntimeError if ffmpeg exits non-zero; previously a failure was
    silently ignored and a stale/missing WAV went on to transcription.
    """
    idx = audio_stream['index']
    command = [ffmpeg,
               # -y: overwrite without prompting — stdin is captured, so an
               # interactive overwrite prompt would make ffmpeg fail.
               '-y',
               '-i', input_path,
               '-map', f'0:{idx}',
               '-vn',
               '-ar', '44100',
               '-acodec', 'pcm_s16le',
               output_path]
    print(f'command: {command}')
    result = subprocess.run(command, capture_output=True, text=True)
    print(f'stdout: {result.stdout}')
    print(f'stderr: {result.stderr}')
    if result.returncode != 0:
        raise RuntimeError(f'ffmpeg failed ({result.returncode}): {result.stderr}')
def transcribe(path):
    """Run the Whisper model over the audio file at *path*, forcing English."""
    return model.transcribe(path, language='en')
def process_file(path):
    """Generate English .srt subtitle files next to the movie file *path*.

    One '<path>.eng.<n>.srt' file is written per matching audio stream.
    Skips .srt files themselves and files whose first output
    '<path>.eng.1.srt' already exists. All errors are printed and swallowed
    so a batch run keeps going (best-effort); ffprobe failures on non-media
    files land in the outer except and are simply reported.
    """
    print(f'Processing file: {path}')
    if path.endswith('.srt'):
        print('srt file; skip')
        return
    # Only the first output is checked; re-running after a partial failure
    # redoes every stream of this file.
    if os.path.exists(f'{path}.eng.1.srt'):
        print('srt file already exists; skip')
        return
    tmp_file = '_tmp.wav'
    try:
        streams = get_streams(path)
        audio_streams = get_audio_streams(streams)
        for i, stream in enumerate(audio_streams):
            try:
                # Stale WAV from a previous iteration/run must not be reused.
                if os.path.exists(tmp_file):
                    print(f'removing {tmp_file}')
                    os.remove(tmp_file)
                extract_audio_file(path, stream, tmp_file)
                res = transcribe(tmp_file)
                res.to_srt_vtt(f'{path}.eng.{i+1}.srt')
            except Exception as ex:
                # Best-effort: report and continue with the next audio stream.
                print(ex)
    except Exception as ex:
        print(ex)
    finally:
        # Previously the last temporary WAV was left behind; always clean up.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
def process_files(directory):
    """Recursively run process_file() on every file under *directory*."""
    for root, _subdirs, names in os.walk(directory):
        for name in names:
            full_path = os.path.join(root, name)
            process_file(full_path)
if __name__ == '__main__':
    # Entry point: replace the placeholder below with the folder to process
    # before running; all descendant files are handled recursively.
    process_files(r'path_to_your_movie_folder')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment