Skip to content

Instantly share code, notes, and snippets.

@alexpovel
Last active January 15, 2023 17:04
Show Gist options
  • Save alexpovel/8e0d051a0a552028258c779588f9d8ec to your computer and use it in GitHub Desktop.
Save alexpovel/8e0d051a0a552028258c779588f9d8ec to your computer and use it in GitHub Desktop.
Converting mp4 videos (like lecture notes) to text (transcribing) using Python, the `speech_recognition` library, and `openai/whisper`. Made reproducible via Docker.
#!/usr/bin/env python3
import logging
import sys
from pathlib import Path
from subprocess import run
import speech_recognition as sr
# Single recognizer instance shared by all transcription calls in this script.
RECOGNIZER = sr.Recognizer()

# The transcription language is the first command-line argument (e.g. "german").
# Validate it before reading stdin so a forgotten argument fails right away with a
# usage hint instead of an IndexError traceback after the input was consumed.
if len(sys.argv) < 2:
    sys.exit("usage: ls /videos/*.mp4 | convert.py LANGUAGE  (e.g. 'german')")
target_language = sys.argv[1]

# Input files arrive on stdin, one path per line. Surrounding whitespace is
# stripped, blank lines are ignored, and duplicates collapse because a set is used.
files = {
    Path(line)
    for line in map(str.strip, sys.stdin.read().split("\n"))
    if line
}
def convert_to_audio(file: Path, to_suffix: str) -> Path:
    """Extract the audio track of *file* into a sibling file ending in *to_suffix*.

    The conversion is skipped when the target file already exists. Because that
    existence check is what makes re-runs cheap, a target left behind by a failed
    or interrupted ffmpeg run is removed again; otherwise the next run would
    mistake the truncated file for a finished conversion.

    Args:
        file: Path of the input media file.
        to_suffix: Suffix (including the leading dot, e.g. ``".flac"``) of the
            audio file to create; ffmpeg derives the output format from it.

    Returns:
        Path of the (existing or newly created) audio file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    new_file = file.with_suffix(to_suffix)
    logging.info(f"Converting {file} to {new_file}...")

    if new_file.exists():
        logging.warning(f"File {new_file} exists, skipping audio conversion.")
        return new_file

    try:
        run(
            [
                "ffmpeg",
                # This script is fed through a stdin pipe; keep ffmpeg from
                # treating stdin as interactive input.
                "-nostdin",
                "-i",
                str(file.absolute()),
                "-vn",  # drop the video stream, keep audio only
                str(new_file.absolute()),
            ],
            check=True,
        )
    except BaseException:
        # Also covers Ctrl-C: never leave a partial file that a later run would
        # treat as a completed conversion.
        new_file.unlink(missing_ok=True)
        raise

    logging.info(f"Converted {file} to {new_file}.")
    return new_file
def process(file: Path) -> None:
    """Transcribe *file* and write the text next to it as ``<name>.transcribed.txt``.

    The audio track is first converted to FLAC via ``convert_to_audio``. If the
    transcript file already exists, recognition is skipped, so an interrupted
    batch can be resumed without redoing finished files.

    Args:
        file: Path of the input video file.
    """
    audio_file = convert_to_audio(file, to_suffix=".flac")
    logging.warning(f"Starting audio processing on {audio_file}...")

    text_file = file.with_suffix(".transcribed.txt")
    if text_file.exists():
        logging.warning(f"File {text_file} exists, skipping audio recognition.")
        return

    logging.warning(f"Getting audio of {audio_file}...")
    with sr.AudioFile(str(audio_file)) as source:
        audio = RECOGNIZER.record(source)
    logging.warning(f"Got audio of {audio_file}, starting recognition...")

    text = RECOGNIZER.recognize_whisper(audio, language=target_language)
    logging.warning(f"Finished audio processing on {audio_file}.")

    # Transcripts routinely contain non-ASCII characters (e.g. German umlauts);
    # pin the encoding so the output does not depend on the container's locale.
    with open(text_file, mode="w", encoding="utf-8") as f:
        f.write(text)
    logging.warning(f"Wrote results to {text_file}.")
def main() -> None:
    """Transcribe every input file, one after another.

    ``files`` is a set, whose iteration order is arbitrary; sorting makes the
    processing order (and therefore the log output) reproducible between runs.
    """
    for file in sorted(files):
        process(file)

    # Can also do this via multiprocessing, however the recognition library itself
    # seems to be doing well utilizing multiple cores, so the following isn't
    # really necessary.
    # from multiprocessing import Pool, cpu_count
    # with Pool(processes=cpu_count() - 1) as p:
    #     p.map(process, files)


if __name__ == "__main__":
    main()
# Image that bundles convert.py with ffmpeg and the Python speech-recognition stack.
FROM python:3.10.8-bullseye

# System packages; ffmpeg is what convert.py shells out to for audio extraction.
# Package versions are pinned for reproducibility. The apt package lists are
# removed in the same layer so they do not end up in the final image.
RUN apt-get update && apt-get install --yes --no-install-recommends \
    python3-pyaudio=0.2.11-1.3+b1 \
    ffmpeg=7:4.3.5-0+deb11u1 \
    && rm -rf /var/lib/apt/lists/*

# Alternatively, use git+https://github.com/openai/whisper.git
RUN pip install --no-cache-dir \
    speechrecognition==3.9.0 \
    whisper.ai==1.0.0.1

# The accompanying run commands invoke the script as /convert.py.
COPY convert.py .
# Build the image, then transcribe every mp4 in ./videos as German.
docker build --tag transcript .
# The volume argument is quoted so a working directory containing spaces still mounts correctly.
docker run -v "${PWD}/videos:/videos" transcript bash -c 'ls /videos/*.mp4 | python /convert.py german'
#!/bin/bash
# Build the image and, only if the build succeeded, transcribe every mp4 in
# ./videos as English. The volume argument is quoted so that a working directory
# containing spaces is not word-split.
docker build --tag transcript . \
    && docker run -v "$(pwd)/videos:/videos" transcript \
        bash -c 'ls /videos/*.mp4 | python /convert.py english'
@alexpovel
Copy link
Author

$ docker --version
Docker version 20.10.21, build baeda1f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment