RyanFleck/transcribe_video.py

## transcribe_video.py
# transcribe_video.py

# A script to strip the audio from a video, split the audio into chunks, and send
# each chunk through the Google voice-to-text library to get the transcript.

# Requires: SpeechRecognition, pydub, moviepy, numpy==1.19.3
# FFMPEG must also be installed on the system.
# Tested on Windows 10.

import speech_recognition as sr
from moviepy.editor import AudioFileClip
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Sources:
# https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
# https://medium.com/python-in-plain-english/how-to-extract-audio-from-video-in-python-51c4dcd5989f

# Module-level recognizer shared by split_and_transcribe() for every chunk.
r = sr.Recognizer()
# NOTE(review): energy_threshold is the recognizer's speech/silence energy
# level; 300 looks like the library default - confirm against the
# SpeechRecognition docs before tuning.
r.energy_threshold = 300


def clean_text(text: str) -> str:
    """Format one recognized phrase as a capitalized sentence.

    Surrounding whitespace is stripped, the first character is upper-cased
    and ``". "`` is appended so successive results can be concatenated.

    Args:
        text: Raw transcript text for a single audio chunk.

    Returns:
        The formatted sentence, or an empty string when ``text`` is empty
        or contains only whitespace.
    """
    text = text.strip()
    if not text:
        # Nothing to capitalize; indexing text[0] would raise IndexError.
        return ""
    return text[0].upper() + text[1:] + ". "

def split_and_transcribe(path):
    """Split a WAV file on silence and transcribe it chunk by chunk.

    Each chunk is exported to ``audio-chunks/chunk<i>.wav`` (the folder is
    created if missing), read back through the module-level recognizer ``r``
    and sent to ``recognize_google``. Recognized text is passed through
    ``clean_text`` and appended to the result. A chunk that raises
    ``sr.UnknownValueError`` contributes no text; instead a blank line is
    appended, unless the result is empty or already ends with a newline.

    Args:
        path: Path of the WAV file to transcribe.

    Returns:
        The concatenated transcript as a single string.
    """
    print("Opening audio segment...")
    recording = AudioSegment.from_wav(path)
    print("Splitting audio file...")
    segments = split_on_silence(
        recording,
        min_silence_len=1000,
        silence_thresh=recording.dBFS - 14,
        keep_silence=500,
    )

    chunk_dir = "audio-chunks"
    if not os.path.isdir(chunk_dir):
        print("Creating a new temp directory for audio chunks.")
        os.mkdir(chunk_dir)

    transcript = ""
    for index, segment in enumerate(segments, start=1):
        print(f"Exporting chunk {index}")
        wav_path = os.path.join(chunk_dir, f"chunk{index}.wav")
        segment.export(wav_path, format="wav")
        print("Opening chunk...")
        with sr.AudioFile(wav_path) as source:
            print(f"Recognizing chunk with name {wav_path}")
            captured = r.record(source)
            print("Converting to text...")
            try:
                raw = r.recognize_google(captured)
            except sr.UnknownValueError as err:
                print("Unknown Value Error: ", str(err))
                # Mark the unrecognized stretch with a paragraph break,
                # but never stack breaks or start the document with one.
                if transcript and not transcript.endswith("\n"):
                    transcript += "\n\n"
                continue
            sentence = clean_text(raw)
            print(f"{wav_path} =>  '{sentence}'")
            transcript += sentence

    return transcript


def extractname(filepath):
    """Return the file name of ``filepath`` without directory or extension.

    Only the final extension is removed, so ``"a.b.mp4"`` gives ``"a.b"``.
    A name that has no extension is returned unchanged rather than
    collapsing to an empty string.

    Args:
        filepath: Path or bare file name.

    Returns:
        The base name with its last extension stripped.
    """
    stem, _extension = os.path.splitext(os.path.basename(filepath))
    return stem


def transcribe(filename, to_filename):
    """Announce the job and return the transcript of ``filename``.

    ``to_filename`` is only used in the progress message; nothing is
    written to it here.
    """
    print(f"Transcribing {filename} => {to_filename}")
    transcript = split_and_transcribe(filename)
    return transcript


def replace_common_transcription_errors(text: str) -> str:
    """Replace AND and BUT sentence starts/ends, and other errors.

    A sentence that ends on the word "and" is turned into a trailing
    "and...", a following capitalized "And" is dropped, and sentences that
    start with "And"/"But" are merged into the previous one with a comma.

    Args:
        text: Transcript text as produced by the chunk transcriber.

    Returns:
        The text with the substitutions applied.
    """
    import re  # Local import: only this function needs it.

    print("Replacing transcription errors...")
    # Match "and." as a whole word only, so words that merely end in "and"
    # ("understand.", "band.") are left untouched.
    text = re.sub(r"\band\.", "and...", text)
    text = text.replace("and... And", "and...")
    text = text.replace(". And ", ", and ")
    text = text.replace(". But ", ", but ")
    # text = text.replace(" i ", " I ")
    return text


def main():
    """Transcribe every .mp4/.webm video in the current working directory.

    For each video that has no ``<name>.txt`` yet, the audio track is
    written to ``<name>.wav``, transcribed, post-processed, printed and
    saved to ``<name>.txt``. Videos that already have a transcript are
    skipped.
    """
    print("Finding all movies...")
    # Dotted suffixes so that a name like "xmp4" is not mistaken for a video.
    movies = [name for name in os.listdir() if name.endswith((".mp4", ".webm"))]

    for movie in movies:
        print(f"Processing Movie '{movie}'")
        filename = extractname(movie)
        transcribedfile = f"{filename}.txt"
        if os.path.exists(transcribedfile):
            print("A transcription already exists: " + str(transcribedfile))
            continue

        print("No text file found. Transcribing...")
        print("Converting to audio...")
        audiofile = f"{filename}.wav"
        audio = AudioFileClip(str(movie))
        try:
            audio.write_audiofile(audiofile)
        finally:
            # Release the clip's reader even if the export fails.
            audio.close()

        text = transcribe(audiofile, transcribedfile)
        text = replace_common_transcription_errors(text)
        print("\nGot Text:\n\n")
        print(text)
        print()
        # Explicit UTF-8 so the transcript does not depend on the platform's
        # default encoding; the with-block closes the file on exit.
        with open(transcribedfile, "w", encoding="utf-8") as outfile:
            print("Writing to file...")
            # Name the audio file that was actually transcribed (.wav).
            outfile.write(f"Full Transcript for Audio File {audiofile}\n\n")
            outfile.write(text)


# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse clean_text) does not start transcribing.
if __name__ == "__main__":
    main()
	# transcribe_video.py

	# A script to strip the audio from a video, split the audio into chunks, and send
	# each chunk through the Google voice-to-text library to get the transcript.

	# Requires: SpeechRecognition, pydub, moviepy, numpy==1.19.3
	# FFMPEG must also be installed on the system.
	# Tested on Windows 10.

	import speech_recognition as sr
	from moviepy.editor import AudioFileClip
	import os
	from pydub import AudioSegment
	from pydub.silence import split_on_silence

	# Sources:
	# https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
	# https://medium.com/python-in-plain-english/how-to-extract-audio-from-video-in-python-51c4dcd5989f

	# Module-level recognizer shared by split_and_transcribe() for every chunk.
	r = sr.Recognizer()
	# NOTE(review): energy_threshold is the recognizer's speech/silence energy
	# level; 300 looks like the library default - confirm against the
	# SpeechRecognition docs before tuning.
	r.energy_threshold = 300


	def clean_text(text: str) -> str:
	    """Format one recognized phrase as a capitalized sentence.

	    Surrounding whitespace is stripped, the first character is upper-cased
	    and ``". "`` is appended so successive results can be concatenated.

	    Args:
	        text: Raw transcript text for a single audio chunk.

	    Returns:
	        The formatted sentence, or an empty string when ``text`` is empty
	        or contains only whitespace.
	    """
	    text = text.strip()
	    if not text:
	        # Nothing to capitalize; indexing text[0] would raise IndexError.
	        return ""
	    return text[0].upper() + text[1:] + ". "

	def split_and_transcribe(path):
	    """Split a WAV file on silence and transcribe it chunk by chunk.

	    Each chunk is exported to ``audio-chunks/chunk<i>.wav`` (the folder is
	    created if missing), read back through the module-level recognizer
	    ``r`` and sent to ``recognize_google``. Recognized text is passed
	    through ``clean_text`` and appended to the result. A chunk that raises
	    ``sr.UnknownValueError`` contributes no text; instead a blank line is
	    appended, unless the result is empty or already ends with a newline.

	    Args:
	        path: Path of the WAV file to transcribe.

	    Returns:
	        The concatenated transcript as a single string.
	    """
	    print("Opening audio segment...")
	    sound = AudioSegment.from_wav(path)
	    print("Splitting audio file...")
	    chunks = split_on_silence(
	        sound, min_silence_len=1000, silence_thresh=sound.dBFS - 14, keep_silence=500
	    )
	    folder_name = "audio-chunks"

	    if not os.path.isdir(folder_name):
	        print("Creating a new temp directory for audio chunks.")
	        os.mkdir(folder_name)

	    document = ""

	    for i, audio_chunk in enumerate(chunks, start=1):
	        print(f"Exporting chunk {i}")
	        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
	        audio_chunk.export(chunk_filename, format="wav")
	        print("Opening chunk...")
	        with sr.AudioFile(chunk_filename) as source:
	            print(f"Recognizing chunk with name {chunk_filename}")
	            audio_listened = r.record(source)
	            print("Converting to text...")
	            try:
	                text = r.recognize_google(audio_listened)
	            except sr.UnknownValueError as e:
	                print("Unknown Value Error: ", str(e))
	                # Mark the unrecognized stretch with a paragraph break,
	                # but never stack breaks or start the document with one.
	                if document and not document.endswith("\n"):
	                    document += "\n\n"
	            else:
	                text = clean_text(text)
	                print(f"{chunk_filename} => '{text}'")
	                document += text

	    return document


	def extractname(filepath):
	    """Return the file name of ``filepath`` without directory or extension.

	    Only the final extension is removed, so ``"a.b.mp4"`` gives ``"a.b"``.
	    A name that has no extension is returned unchanged rather than
	    collapsing to an empty string.

	    Args:
	        filepath: Path or bare file name.

	    Returns:
	        The base name with its last extension stripped.
	    """
	    stem, _extension = os.path.splitext(os.path.basename(filepath))
	    return stem


	def transcribe(filename, to_filename):
	    """Announce the job and return the transcript of ``filename``.

	    ``to_filename`` is only used in the progress message; nothing is
	    written to it here.
	    """
	    print(f"Transcribing {filename} => {to_filename}")
	    return split_and_transcribe(filename)


	def replace_common_transcription_errors(text: str) -> str:
	    """Replace AND and BUT sentence starts/ends, and other errors.

	    A sentence that ends on the word "and" is turned into a trailing
	    "and...", a following capitalized "And" is dropped, and sentences that
	    start with "And"/"But" are merged into the previous one with a comma.

	    Args:
	        text: Transcript text as produced by the chunk transcriber.

	    Returns:
	        The text with the substitutions applied.
	    """
	    import re  # Local import: only this function needs it.

	    print("Replacing transcription errors...")
	    # Match "and." as a whole word only, so words that merely end in "and"
	    # ("understand.", "band.") are left untouched.
	    text = re.sub(r"\band\.", "and...", text)
	    text = text.replace("and... And", "and...")
	    text = text.replace(". And ", ", and ")
	    text = text.replace(". But ", ", but ")
	    # text = text.replace(" i ", " I ")
	    return text


	def main():
	    """Transcribe every .mp4/.webm video in the current working directory.

	    For each video that has no ``<name>.txt`` yet, the audio track is
	    written to ``<name>.wav``, transcribed, post-processed, printed and
	    saved to ``<name>.txt``. Videos that already have a transcript are
	    skipped.
	    """
	    print("Finding all movies...")
	    # Dotted suffixes so that a name like "xmp4" is not mistaken for a video.
	    movies = [name for name in os.listdir() if name.endswith((".mp4", ".webm"))]

	    for movie in movies:
	        print(f"Processing Movie '{movie}'")
	        filename = extractname(movie)
	        transcribedfile = f"{filename}.txt"
	        if os.path.exists(transcribedfile):
	            print("A transcription already exists: " + str(transcribedfile))
	            continue

	        print("No text file found. Transcribing...")
	        print("Converting to audio...")
	        audiofile = f"{filename}.wav"
	        audio = AudioFileClip(str(movie))
	        try:
	            audio.write_audiofile(audiofile)
	        finally:
	            # Release the clip's reader even if the export fails.
	            audio.close()

	        text = transcribe(audiofile, transcribedfile)
	        text = replace_common_transcription_errors(text)
	        print("\nGot Text:\n\n")
	        print(text)
	        print()
	        # Explicit UTF-8 so the transcript does not depend on the platform's
	        # default encoding; the with-block closes the file on exit.
	        with open(transcribedfile, "w", encoding="utf-8") as outfile:
	            print("Writing to file...")
	            # Name the audio file that was actually transcribed (.wav).
	            outfile.write(f"Full Transcript for Audio File {audiofile}\n\n")
	            outfile.write(text)


	# Run the pipeline only when executed as a script, so importing this module
	# does not start transcribing.
	if __name__ == "__main__":
	    main()