Last active
September 19, 2023 21:14
-
-
Save rasbt/0d09932c861851f177bd8f13dc93b354 to your computer and use it in GitHub Desktop.
Script that creates subtitles (closed captions) for all MP4 video files in your current directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sebastian Raschka 09/24/2022
# Create a new conda environment and install the required packages
# conda create -n whisper python=3.9
# conda activate whisper
# conda install mlxtend -c conda-forge
# Install ffmpeg
# macOS & homebrew
# brew install ffmpeg
# Ubuntu
# sudo apt-get install ffmpeg
# Install whisper
# from repo https://github.com/openai/whisper
# pip install git+https://github.com/openai/whisper.git
import os
import os.path as osp
import subprocess

from mlxtend.file_io import find_files
from mlxtend.utils import Counter
# Collect every path under the current directory (searched recursively)
# whose name contains ".mp4".
all_videos = find_files(substring=".mp4", path="./", recursive=True)

# Guard the example print: indexing an empty result would raise IndexError.
if all_videos:
    print("Example path:", all_videos[0])
print("Number of videos to process:", len(all_videos))

audio_outdir = "./extracted_audio"
subtitle_outdir = "./generated_subtitles"

# Create the output directories; exist_ok avoids the check-then-create race.
for this_dir in (audio_outdir, subtitle_outdir):
    os.makedirs(this_dir, exist_ok=True)

cnt = Counter()
for v in all_videos:
    base, ext = osp.splitext(v)
    # NOTE: only the basename is kept, so videos with the same file name in
    # different subdirectories map to the same audio file.
    aac_file_out = osp.join(audio_outdir, osp.basename(base)) + ".aac"

    # Extract the audio track from the video.
    # Arguments are passed as a list (no shell involved), so paths that
    # contain spaces or shell metacharacters ($, &, ;, ...) are handed to
    # ffmpeg verbatim instead of being split or interpreted by the shell.
    # "-acodec copy" copies the audio stream without re-encoding, so this
    # presumably requires the MP4 audio track to already be AAC -- TODO confirm.
    extraction = subprocess.run(
        ["ffmpeg", "-i", v, "-vn", "-acodec", "copy", aac_file_out]
    )

    if extraction.returncode == 0:
        # Transcribe the extracted audio; whisper writes the subtitle files
        # into subtitle_outdir.
        subprocess.run(
            [
                "whisper",
                aac_file_out,
                "--model",
                "medium",
                "--language",
                "English",
                "--output_dir",
                subtitle_outdir,
                "--verbose",
                "False",
            ]
        )
    else:
        # Without an audio file there is nothing to transcribe; report the
        # failure and move on to the next video instead of running whisper
        # on a missing file.
        print(
            f"Skipping {v}: ffmpeg exited with code {extraction.returncode}"
        )

    cnt.update()
Argh, sorry to hear about the trouble. I never have any files with whitespace in their names and didn't think about this (it is an old habit of mine to always remove it, e.g., with `for f in *\ *; do mv "$f" "${f// /_}"; done`
in bash).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It took a bit of debugging because my file names contained spaces, special characters (@, $, %, etc.), and backslashes (which denote escaped spaces in file names); also, the codec did not work for me.