Last active
June 18, 2023 21:58
-
-
Save rubyu/9079512bd8856cf9e910da2824709d80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a script for creating English subtitle files for movie files. | |
Note that all descendants of the given folder are processed, not just those directly under the folder. | |
Subtitles are transcribed from English audio tracks using the Whisper model, so the quality is almost the same as CC (Closed Caption). | |
The srt file generated by this script is automatically loaded by players such as MPC (Media Player Classic) and it's derivatives. | |
This script requires: | |
- NVIDIA graphics card with sufficient memory (12GB+ recommended) | |
- CUDA 11 | |
- cuDNN for CUDA 11 | |
- zlib | |
- ffmpeg and ffprobe | |
- https://github.com/jianfch/stable-ts/ (!! important !!) | |
Note: It's been reported that the original implementation of openai/whisper is sometimes out of sync with the timestamp, | |
but we can relax this problem to a practically acceptable level by using stable-ts instead. | |
Regarding NVIDIA CUDA, I used the following binaries to set up my environment: | |
- cuda_11.8.0_522.06_windows.exe (https://developer.nvidia.com/cuda-toolkit-archive) | |
- cudnn-windows-x86_64-8.9.2.26_cuda11-archive.zip (https://developer.nvidia.com/cudnn) | |
- zlib123dllx64.zip (http://www.winimage.com/zLibDll/zlib123dllx64.zip) | |
""" | |
import os | |
import subprocess | |
import json | |
import stable_whisper | |
# Whisper model loaded once at import time; 'large-v2' needs a GPU with
# substantial VRAM (the module docstring recommends 12GB+).
# NOTE(review): importing this module therefore requires CUDA and is slow —
# acceptable for a one-shot script, confirm before reusing as a library.
model = stable_whisper.load_model('large-v2')
# External tool commands; assumed to be resolvable on PATH.
ffprobe = 'ffprobe'
ffmpeg = 'ffmpeg'
def get_streams(filepath):
    """Return the list of stream descriptors for *filepath* via ffprobe.

    Each descriptor is a dict with 'index', 'codec_type', 'codec_name' and,
    when present, a 'tags' dict carrying the 'language' tag.

    Raises RuntimeError if ffprobe exits non-zero, and KeyError if the JSON
    output has no 'streams' entry.
    """
    command = [ffprobe,
               '-v', 'error',
               '-print_format', 'json',
               '-show_entries', 'stream=index,codec_type,codec_name:stream_tags=language',
               filepath]
    print(f'command: {command}')
    result = subprocess.run(command, capture_output=True, text=True)
    print(f'stdout: {result.stdout}')
    print(f'stderr: {result.stderr}')
    # Fail loudly here: on error ffprobe prints nothing to stdout, and
    # json.loads('') would otherwise raise a confusing JSONDecodeError.
    if result.returncode != 0:
        raise RuntimeError(f'ffprobe failed ({result.returncode}): {result.stderr}')
    data = json.loads(result.stdout)
    return data['streams']
def get_audio_streams(streams, target_langs=('eng',)):
    """Return the audio streams whose language tag is in *target_langs*.

    *streams* is the list returned by get_streams(). Streams without a
    'tags'/'language' entry are skipped instead of raising KeyError (which
    previously aborted processing of the whole file). The default was a
    mutable list; an immutable tuple is safer and membership-equivalent.
    """
    return [s for s in streams
            if s['codec_type'] == 'audio'
            and s.get('tags', {}).get('language') in target_langs]
def extract_audio_file(input_path, audio_stream, output_path):
    """Extract one audio stream of *input_path* to a 44.1 kHz 16-bit PCM WAV.

    *audio_stream* is one descriptor from get_audio_streams(); its 'index'
    selects the stream via ffmpeg's -map option.

    Raises RuntimeError if ffmpeg exits non-zero; previously a failure was
    silently ignored and a stale/missing WAV went on to transcription.
    """
    idx = audio_stream['index']
    command = [ffmpeg,
               # -y: overwrite without prompting — stdin is captured, so an
               # interactive overwrite prompt would make ffmpeg fail.
               '-y',
               '-i', input_path,
               '-map', f'0:{idx}',
               '-vn',
               '-ar', '44100',
               '-acodec', 'pcm_s16le',
               output_path]
    print(f'command: {command}')
    result = subprocess.run(command, capture_output=True, text=True)
    print(f'stdout: {result.stdout}')
    print(f'stderr: {result.stderr}')
    if result.returncode != 0:
        raise RuntimeError(f'ffmpeg failed ({result.returncode}): {result.stderr}')
def transcribe(path):
    """Run the Whisper model over the audio file at *path*, forcing English."""
    return model.transcribe(path, language='en')
def process_file(path):
    """Generate English .srt subtitle files next to the movie file *path*.

    One '<path>.eng.<n>.srt' file is written per matching audio stream.
    Skips .srt files themselves and files whose first output
    '<path>.eng.1.srt' already exists. All errors are printed and swallowed
    so a batch run keeps going (best-effort); ffprobe failures on non-media
    files land in the outer except and are simply reported.
    """
    print(f'Processing file: {path}')
    if path.endswith('.srt'):
        print('srt file; skip')
        return
    # Only the first output is checked; re-running after a partial failure
    # redoes every stream of this file.
    if os.path.exists(f'{path}.eng.1.srt'):
        print('srt file already exists; skip')
        return
    tmp_file = '_tmp.wav'
    try:
        streams = get_streams(path)
        audio_streams = get_audio_streams(streams)
        for i, stream in enumerate(audio_streams):
            try:
                # Stale WAV from a previous iteration/run must not be reused.
                if os.path.exists(tmp_file):
                    print(f'removing {tmp_file}')
                    os.remove(tmp_file)
                extract_audio_file(path, stream, tmp_file)
                res = transcribe(tmp_file)
                res.to_srt_vtt(f'{path}.eng.{i+1}.srt')
            except Exception as ex:
                # Best-effort: report and continue with the next audio stream.
                print(ex)
    except Exception as ex:
        print(ex)
    finally:
        # Previously the last temporary WAV was left behind; always clean up.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
def process_files(directory):
    """Recursively run process_file() on every file under *directory*."""
    for root, _subdirs, names in os.walk(directory):
        for name in names:
            full_path = os.path.join(root, name)
            process_file(full_path)
if __name__ == '__main__':
    # Entry point: replace the placeholder below with the folder to process
    # before running; all descendant files are handled recursively.
    process_files(r'path_to_your_movie_folder')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment