Skip to content

Instantly share code, notes, and snippets.

@alexpovel
Last active January 15, 2023 17:04
Show Gist options
  • Save alexpovel/8e0d051a0a552028258c779588f9d8ec to your computer and use it in GitHub Desktop.
Save alexpovel/8e0d051a0a552028258c779588f9d8ec to your computer and use it in GitHub Desktop.
Converting mp4 videos (like lecture notes) to text (transcribing) using Python, its `speech_recognition` libs, and `openai/whisper`. Made reproducible via Docker.
#!/usr/bin/env python3
import logging
import sys
from pathlib import Path
from subprocess import run
import speech_recognition as sr
# One recognizer instance, shared by every file that gets processed.
RECOGNIZER = sr.Recognizer()

# Paths to process are read from stdin, one per line. Surrounding whitespace is
# stripped, empty lines are dropped, and duplicates collapse because this is a set.
files = {
    Path(entry)
    for entry in map(str.strip, sys.stdin.read().split("\n"))
    if entry
}

# Language handed to the recognizer; taken from the first command-line argument.
target_language = sys.argv[1]
def convert_to_audio(file: Path, to_suffix: str) -> Path:
    """Extract the audio track of *file* into a sibling file using ffmpeg.

    The output path is *file* with its suffix replaced by *to_suffix*. If that
    path already exists, nothing is converted and the existing path is returned.

    Args:
        file: Input media file.
        to_suffix: Suffix of the audio file to create, including the dot
            (e.g. ``".flac"``).

    Returns:
        Path of the audio file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    new_file = file.with_suffix(to_suffix)
    logging.info(f"Converting {file} to {new_file}...")
    if new_file.exists():
        logging.warning(f"File {new_file} exists, skipping audio conversion.")
        return new_file

    # Convert into a temporary sibling first and rename only on success.
    # Writing straight to `new_file` would leave a truncated file behind when
    # ffmpeg fails or is interrupted, and the existence check above would then
    # treat that broken file as a finished conversion on the next run.
    # The real suffix is kept last so ffmpeg still infers the output format.
    partial_file = new_file.with_name(f"{new_file.stem}.partial{new_file.suffix}")
    try:
        run(
            [
                "ffmpeg",
                "-nostdin",  # Never read the inherited stdin (it carries the file list).
                "-y",  # Overwrite a stale partial file instead of prompting.
                "-i",
                str(file.absolute()),
                "-vn",
                str(partial_file.absolute()),
            ],
            check=True,
        )
        partial_file.replace(new_file)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial_file.unlink(missing_ok=True)
    logging.info(f"Converted {file} to {new_file}.")
    return new_file
def process(file: Path) -> None:
    """Transcribe a single media file to a ``.transcribed.txt`` file next to it.

    The file is first converted to FLAC via ``convert_to_audio``. The audio is
    then read and passed to ``RECOGNIZER.recognize_whisper`` with the
    module-level ``target_language``. If the transcript file already exists,
    recognition is skipped.

    Args:
        file: Input media file.
    """
    audio_file = convert_to_audio(file, to_suffix=".flac")
    logging.warning(f"Starting audio processing on {audio_file}...")
    text_file = file.with_suffix(".transcribed.txt")
    if text_file.exists():
        logging.warning(f"File {text_file} exists, skipping audio recognition.")
        return

    logging.warning(f"Getting audio of {audio_file}...")
    with sr.AudioFile(str(audio_file)) as source:
        audio = RECOGNIZER.record(source)
    logging.warning(f"Got audio of {audio_file}, starting recognition...")
    text = RECOGNIZER.recognize_whisper(audio, language=target_language)
    logging.warning(f"Finished audio processing on {audio_file}.")
    # Pin the encoding: transcripts routinely contain non-ASCII characters
    # (e.g. German umlauts), and the platform default encoding may not be
    # able to represent them.
    with open(text_file, mode="w", encoding="utf-8") as f:
        f.write(text)
    logging.warning(f"Wrote results to {text_file}.")
def main():
    """Transcribe every file listed on stdin, one after another."""
    # The root logger only shows WARNING and above unless configured, which
    # would silently drop the INFO progress messages of the conversion step.
    logging.basicConfig(level=logging.INFO)
    # `files` is a set; sort it so the processing order is stable across runs.
    for file in sorted(files):
        process(file)
    # Can also do this via multiprocessing, however the recognition library itself seems to
    # be doing well utilizing multiple cores, so the following isn't really necessary.
    # from multiprocessing import Pool, cpu_count
    # with Pool(processes=cpu_count() - 1) as p:
    #     p.map(process, files)


if __name__ == "__main__":
    main()
# Base image pinned to an exact tag so builds are reproducible.
FROM python:3.10.8-bullseye

# System dependencies: the PyAudio package and ffmpeg (convert.py calls ffmpeg
# to extract the audio track). The apt package lists are deleted in the same
# layer so they do not end up in the image.
RUN apt-get update && apt-get install --yes --no-install-recommends \
    python3-pyaudio=0.2.11-1.3+b1 \
    ffmpeg=7:4.3.5-0+deb11u1 \
    && rm -rf /var/lib/apt/lists/*

# Alternatively, use git+https://github.com/openai/whisper.git
RUN pip install --no-cache-dir \
    speechrecognition==3.9.0 \
    whisper.ai==1.0.0.1

# Copied into the image's working directory; the run commands invoke it as
# /convert.py, i.e. they assume the working directory is the filesystem root.
COPY convert.py .
# Build the image, then transcribe every mp4 in ./videos as German.
# The volume argument is quoted so a working directory containing spaces
# is passed to docker as a single argument.
docker build --tag transcript .
docker run -v "${PWD}/videos:/videos" transcript bash -c 'ls /videos/*.mp4 | python /convert.py german'
#!/bin/bash
# Build the image and, only if the build succeeds, transcribe every mp4 in
# ./videos as English. The volume argument is quoted so that a working
# directory containing spaces is not word-split by the shell.
docker build --tag transcript . && docker run -v "$(pwd)/videos:/videos" transcript bash -c 'ls /videos/*.mp4 | python /convert.py english'
@alexpovel
Copy link
Author

Prerequisites are:

  • Docker installation
  • Videos to transcribe, in mp4 format, in a videos subdirectory of the directory the commands are run from

After conversion, the videos directory will contain new .flac files as well as .transcribed.txt files.
The .flac files are intermediate files, required only because speechrecognition can't handle video files directly (yet?).

@alexpovel
Copy link
Author

$ docker --version
Docker version 20.10.21, build baeda1f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment