Audio Transcriber using OpenAI Whisper
import argparse
import subprocess
import os
import math

from openai import OpenAI

MAX_SIZE = 26214400  # Maximum file size in bytes (25 MB, the Whisper API upload limit)

# Parse command line arguments
parser = argparse.ArgumentParser(
    description="Transcribe an audio file using OpenAI Whisper."
)
parser.add_argument("file_path", type=str, help="Path to the audio or video file")
args = parser.parse_args()

# Get the API key from the environment
print("Getting API key from environment...")
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
    raise ValueError("Environment variable OPENAI_API_KEY is not set")
print("API key obtained.")

# Configure the OpenAI client with the API key
client = OpenAI(api_key=api_key)
def transcribe_file(file_path):
    # Check if the file is a video file and extract the audio track if so
    if is_video_file(file_path):
        file_path = handle_video_file(file_path)

    # Calculate duration for each chunk
    file_size = os.path.getsize(file_path)
    num_chunks = math.ceil(file_size / MAX_SIZE)
    if num_chunks > 1:
        file_duration = get_audio_duration(file_path)
        chunk_duration = (
            math.ceil(file_duration / num_chunks) + 10
        )  # Add 10 seconds of overlap to each chunk

        transcripts = []
        chunk_files = []  # Store the paths of the chunk files

        # Split the file into chunks and transcribe them sequentially
        for i in range(num_chunks):
            start_time = max(
                0, i * (chunk_duration - 10) - 5
            )  # Step back by the overlap so chunk boundaries are covered
            chunk_file_path = f"{file_path}_chunk{i}.mp3"
            chunk_files.append(chunk_file_path)  # Store the path of the chunk file

            # Create the chunk using ffmpeg
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    file_path,
                    "-ss",
                    str(start_time),
                    "-t",
                    str(chunk_duration),
                    "-vn",
                    "-acodec",
                    "libmp3lame",
                    "-n",
                    chunk_file_path,
                ]
            )

            # Open the chunk file
            print(f"Transcribing chunk {i + 1}/{num_chunks}...")
            with open(chunk_file_path, "rb") as audio_file:
                # Transcribe the chunk
                response = client.audio.transcriptions.create(
                    model="whisper-1", file=audio_file
                )
                # Extract the transcription text from the response
                transcript = response.text
                transcripts.append(transcript)
                print(f"Chunk {i + 1} transcribed.")
                print(transcript)

        # Combine the chunk transcripts
        transcript = "\n\n".join(transcripts)

        # Clean up the chunk files
        for chunk_file in chunk_files:
            os.remove(chunk_file)
    else:
        print("Transcribing...")
        with open(file_path, "rb") as audio_file:
            # Transcribe the whole file in one request
            response = client.audio.transcriptions.create(
                model="whisper-1", file=audio_file
            )
            # Extract the transcription text from the response
            transcript = response.text
        print("Transcribed.")
    return transcript
def get_audio_duration(file_path):
    """Get the duration of an audio file in seconds."""
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            file_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if len(result.stdout) == 0:
        raise ValueError("Unable to get audio duration: " + file_path)
    duration = float(result.stdout)
    return duration
def handle_video_file(video_file_path):
    if not is_readable(video_file_path):
        raise OSError("Unable to read from: " + video_file_path)
    print("Extracting audio from video...")
    audio_file_path = os.path.splitext(video_file_path)[0] + "_audio.mp3"
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_file_path,
            "-vn",
            "-acodec",
            "libmp3lame",
            "-n",
            audio_file_path,
        ]
    )
    print("Audio extracted from video.")
    return audio_file_path
def is_video_file(file_path):
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "v:0",
            "-show_entries",
            "stream=codec_name",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            file_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return len(result.stdout) > 0
def is_readable(file_path):
    try:
        with open(file_path, "rb"):
            return True
    except IOError:
        return False
# Transcribe
transcript = transcribe_file(args.file_path)

# Print the transcript
print("Transcript:")
print(transcript)

# Create the output filename
filename = os.path.splitext(args.file_path)[0] + "_transcript.txt"

# Write the transcript to a file
print("Saving transcript to file...")
with open(filename, "w") as file:
    file.write(transcript)
print("Transcript saved to file.")
diatche commented Jul 23, 2023

This script can be run from the command line as follows:

python3 transcribe.py /path/to/file/audio.mp3

This will transcribe the audio file and write the transcript to a text file in the same directory as the input; the transcript is also printed to the console. The script depends on the openai Python library, and ffmpeg (including ffprobe) must be installed and available on your PATH.
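
As a minimal setup sketch (assuming pip and a POSIX shell; the API key value is a placeholder), the prerequisites can be installed and the key exported before running the script:

pip install openai
export OPENAI_API_KEY="sk-your-key-here"
python3 transcribe.py /path/to/file/audio.mp3

ffmpeg itself is not a pip package, so install it through your system's package manager (for example apt, brew, or the official builds).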
