# kcarnold/video-subtitles-via-whisper.py

## video-subtitles-via-whisper.py
# Sebastian Raschka 09/24/2022
# Fixed to avoid problems with spaces and other special characters in filenames, Ken Arnold 10/25/2022
#
# Create a new conda environment and install the required packages
#   conda create -n whisper python=3.9
#   conda activate whisper
#   conda install mlxtend -c conda-forge

# Install ffmpeg
# macOS & homebrew
#   brew install ffmpeg
# Ubuntu
#   sudo apt-get install ffmpeg

# Install whisper
# from repo https://github.com/openai/whisper
#   pip install git+https://github.com/openai/whisper.git

import subprocess
import os
import os.path as osp
from mlxtend.file_io import find_files
from mlxtend.utils import Counter


# Batch pipeline: for every video found below the current directory, extract
# its audio track with ffmpeg and then run the whisper CLI on that audio to
# generate subtitles.

# `find_files` is asked for paths matching the ".mp4" substring, searching
# sub-directories as well.
all_videos = find_files(substring=".mp4", path="./", recursive=True)
if not all_videos:
    # Indexing an empty result below would raise IndexError; stop with a
    # clear message instead.
    raise SystemExit("No .mp4 files found below the current directory.")

print("Example path:", all_videos[0])
print("Number of videos to process:", len(all_videos))

audio_outdir = "./extracted_audio"
subtitle_outdir = "./generated_subtitles"

for this_dir in (audio_outdir, subtitle_outdir):
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(this_dir, exist_ok=True)

cnt = Counter()
for v in all_videos:

    # NOTE: only the basename is kept, so with the recursive search two videos
    # that share a file name in different directories map to the same outputs.
    base, _ = osp.splitext(v)
    aac_file_out = osp.join(audio_outdir, osp.basename(base)) + ".aac"

    # Extract the audio track from the video: "-vn" drops the video stream and
    # "-acodec copy" keeps the audio stream as-is. Arguments are passed as a
    # list (no shell) and paths carry ffmpeg's "file:" prefix so that spaces
    # and other special characters in file names are handled safely.
    extraction = subprocess.run(
        ["ffmpeg", "-i", "file:" + v, "-vn", "-acodec", "copy", "file:" + aac_file_out]
    )

    if extraction.returncode == 0:
        # Transcribe the extracted audio; whisper writes its subtitle files
        # into `subtitle_outdir`.
        subprocess.run(["whisper", aac_file_out, "--model", "medium", "--language", "English",
            "--output_dir", subtitle_outdir, "--verbose", "False"])
    else:
        # Without a usable audio file there is nothing for whisper to read, so
        # report the failure and move on to the next video.
        print(f"Skipping {v}: ffmpeg exited with status {extraction.returncode}")

    # Advance the progress counter once per video, whether or not it succeeded.
    cnt.update()