Skip to content

Instantly share code, notes, and snippets.

@fiddyschmitt
Created May 29, 2023 05:37
Show Gist options
  • Save fiddyschmitt/80892610ab58dd36ce7c619a4c3379ae to your computer and use it in GitHub Desktop.
Save fiddyschmitt/80892610ab58dd36ce7c619a4c3379ae to your computer and use it in GitHub Desktop.
Generate subtitles/captions for MP4 files using OpenAI Whisper
import os
import whisper
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
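# Requirements:
#   pip install -U openai-whisper
#   winget install ffmpeg
# Then copy ffmpeg.exe out of the WinGet package folder below to a location where
# it can be found when this script runs (e.g. a folder that is on PATH):
#   C:\Users\foo\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-6.0-full_build\bin\ffmpeg.exe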
def transcribe_file(input_file, model):
    """Transcribe one media file and write ``.srt`` and ``.txt`` output beside it.

    The file is skipped when a ``.txt`` file with the same stem already exists
    in the same folder. Any exception raised while transcribing or writing is
    caught and reported on stdout, so a failing file does not stop the caller
    from moving on to the next one.

    Args:
        input_file: Path of the media file, as ``str`` or any ``os.PathLike``.
        model: Object exposing ``transcribe(path)``; ``main`` passes the value
            returned by ``whisper.load_model``.

    Returns:
        None.
    """
    # Normalise to ``str`` so a ``pathlib.Path`` argument behaves exactly like
    # the string paths that ``main`` passes in.
    input_file = os.fspath(input_file)
    input_filename = os.path.basename(input_file)
    output_folder = os.path.dirname(input_file)

    try:
        print(f"Processing: {input_filename}")

        # The transcript is the last file written, so its presence is used as
        # the "already done" marker.
        txt_output_file = Path(input_file).with_suffix(".txt")
        if os.path.isfile(txt_output_file):
            print(f"Output already exists: {os.path.basename(txt_output_file)}")
            return

        result = model.transcribe(input_file)

        # Subtitles first, then the plain transcript. Each writer is handed the
        # target path next to the input file.
        # NOTE(review): the writer presumably derives the output name from this
        # path's stem plus its own extension -- confirm against the installed
        # whisper version.
        for fmt in ("srt", "txt"):
            writer = whisper.utils.get_writer(fmt, output_folder)
            writer(result, Path(input_file).with_suffix(f".{fmt}"))
    except Exception as e:
        # Deliberate best-effort: report the failure and let the batch continue.
        print(f"[{input_filename}] An error occurred: {type(e).__name__} - {e}")
def main(input_folder_str=r"H:\Videos", pattern="**/*.mp4", model_name="base"):
    """Transcribe every matching file under a folder, one file at a time.

    Args:
        input_folder_str: Folder to search. Defaults to the folder the script
            was originally hard-coded to.
        pattern: Glob pattern, relative to the folder, selecting the files to
            transcribe (for example ``"**/*.wav"`` for WAV files).
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        None.
    """
    input_folder = Path(input_folder_str)
    # Sorted so repeated runs visit the files in the same order.
    input_files = sorted(input_folder.glob(pattern))

    if not input_files:
        # Nothing to do, so skip loading the model altogether.
        print(f"No files matching {pattern!r} found in {input_folder}")
        return

    model = whisper.load_model(model_name)

    # Files are processed serially on purpose: the original author reported
    # crashes when transcriptions were submitted to an executor in parallel
    # (reportedly the module is not thread safe -- unverified).
    for input_file in input_files:
        transcribe_file(str(input_file), model)
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment