Audio Transcriber using OpenAI Whisper
import argparse
import subprocess
import os
import math

from openai import OpenAI

MAX_SIZE = 26214400  # Maximum file size in bytes (25 MB, the Whisper API upload limit)

# Parse command line arguments
parser = argparse.ArgumentParser(
    description="Transcribe an audio file using OpenAI Whisper."
)
parser.add_argument("file_path", type=str, help="Path to the audio or video file")
args = parser.parse_args()

# Get the API key from the environment
print("Getting API key from environment...")
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
    raise ValueError("Environment variable OPENAI_API_KEY is not set")
print("API key obtained.")

# Configure the OpenAI client with the API key
client = OpenAI(api_key=api_key)
def transcribe_file(file_path):
    # Check if the file is a video file and extract the audio track if so
    if is_video_file(file_path):
        file_path = handle_video_file(file_path)

    # Calculate duration for each chunk
    file_size = os.path.getsize(file_path)
    num_chunks = math.ceil(file_size / MAX_SIZE)
    if num_chunks > 1:
        file_duration = get_audio_duration(file_path)
        chunk_duration = (
            math.ceil(file_duration / num_chunks) + 10
        )  # Add 10 seconds of overlap to each chunk

        transcripts = []
        chunk_files = []  # Store the paths of the chunk files

        # Split the file into chunks and transcribe them sequentially
        for i in range(num_chunks):
            start_time = max(
                0, i * (chunk_duration - 10) - 5
            )  # Step back by the overlap so chunk boundaries are covered
            chunk_file_path = f"{file_path}_chunk{i}.mp3"
            chunk_files.append(chunk_file_path)  # Store the path of the chunk file

            # Create the chunk using ffmpeg
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    file_path,
                    "-ss",
                    str(start_time),
                    "-t",
                    str(chunk_duration),
                    "-vn",
                    "-acodec",
                    "libmp3lame",
                    "-n",
                    chunk_file_path,
                ]
            )

            # Open the chunk file
            print(f"Transcribing chunk {i + 1}/{num_chunks}...")
            with open(chunk_file_path, "rb") as audio_file:
                # Transcribe the chunk
                response = client.audio.transcriptions.create(
                    model="whisper-1", file=audio_file
                )
                # Extract the transcription text from the response
                transcript = response.text
                transcripts.append(transcript)
                print(f"Chunk {i + 1} transcribed.")
                print(transcript)

        # Combine the chunk transcripts
        transcript = "\n\n".join(transcripts)

        # Clean up the chunk files
        for chunk_file in chunk_files:
            os.remove(chunk_file)
    else:
        print("Transcribing...")
        with open(file_path, "rb") as audio_file:
            # Transcribe the whole file in one request
            response = client.audio.transcriptions.create(
                model="whisper-1", file=audio_file
            )
            # Extract the transcription text from the response
            transcript = response.text
        print("Transcribed.")
    return transcript
def get_audio_duration(file_path):
    """Get the duration of an audio file in seconds."""
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            file_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if len(result.stdout) == 0:
        raise ValueError("Unable to get audio duration: " + file_path)
    duration = float(result.stdout)
    return duration
def handle_video_file(video_file_path):
    if not is_readable(video_file_path):
        raise OSError("Unable to read from: " + video_file_path)
    print("Extracting audio from video...")
    audio_file_path = os.path.splitext(video_file_path)[0] + "_audio.mp3"
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_file_path,
            "-vn",
            "-acodec",
            "libmp3lame",
            "-n",
            audio_file_path,
        ]
    )
    print("Audio extracted from video.")
    return audio_file_path
def is_video_file(file_path):
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "v:0",
            "-show_entries",
            "stream=codec_name",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            file_path,
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return len(result.stdout) > 0
def is_readable(file_path):
    try:
        with open(file_path, "rb"):
            return True
    except IOError:
        return False
# Transcribe
transcript = transcribe_file(args.file_path)

# Print the transcript
print("Transcript:")
print(transcript)

# Create the output filename
filename = os.path.splitext(args.file_path)[0] + "_transcript.txt"

# Write the transcript to a file
print("Saving transcript to file...")
with open(filename, "w") as file:
    file.write(transcript)
print("Transcript saved to file.")
diatche commented Jul 23, 2023

This script can be run from the command line as follows:

python3 transcribe.py /path/to/file/audio.mp3

This will transcribe the audio file and write the transcript to a text file in the same directory as the input; the transcript is also printed to the console. The script depends on the openai Python library, and ffmpeg (including ffprobe) must be installed and available on your PATH.
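
As a minimal setup sketch (assuming pip and a POSIX shell; the API key value is a placeholder), the prerequisites can be installed and the key exported before running the script:

pip install openai
export OPENAI_API_KEY="sk-your-key-here"
python3 transcribe.py /path/to/file/audio.mp3

ffmpeg itself is not a pip package, so install it through your system's package manager (for example apt, brew, or the official builds).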
