# mill1000/transcribe.py

## transcribe.py
#!/usr/bin/env python3

import whisper
import argparse
import os
import math
import datetime
import sys

if __name__ == "__main__":

    # Command-line interface. ArgumentDefaultsHelpFormatter makes --help list
    # the default value of every option.
    parser = argparse.ArgumentParser(description="Transcribe files using Whisper and output a summary.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--model", help="Whisper model to use.", default="base.en")
    parser.add_argument("--device", help="Whisper device to use.",
                        default="cuda", choices=["cuda", "cpu"])
    parser.add_argument(
        "--rename", help="Rename files with the transcribed text.", action="store_true")
    # The model is user-selectable, so transcripts may contain non-ASCII text.
    # Open the summary file as UTF-8 explicitly instead of relying on the
    # locale's default encoding, which can raise UnicodeEncodeError on write.
    # The sys.stdout default is used as-is and is not affected by this.
    parser.add_argument("--output", help="File to output to.",
                        type=argparse.FileType('w', encoding="utf-8"), default=sys.stdout)
    parser.add_argument("files", help="Audio files to transcribe.", nargs="+")
    args = parser.parse_args()

    def convert_time(time):
        """Convert a float number of seconds into a ``datetime.time``.

        Args:
            time: Non-negative offset in seconds. The fractional part is
                turned into microseconds (truncated, not rounded).

        Returns:
            A ``datetime.time`` with hour, minute, second and microsecond set.

        Raises:
            ValueError: If the offset is negative or is 24 hours or more,
                because ``datetime.time`` cannot represent it.
        """
        fraction, whole_seconds = math.modf(time)
        # Split into hours/minutes/seconds. Previously the minute field was
        # simply total_seconds / 60, which exceeds 59 for any offset of one
        # hour or more and made datetime.time raise ValueError.
        minutes, seconds = divmod(int(whole_seconds), 60)
        hours, minutes = divmod(minutes, 60)
        return datetime.time(hour=hours, minute=minutes, second=seconds,
                             microsecond=int(fraction * 1e6))

    def eprint(*values, **print_options):
        """Forward all arguments to ``print`` with the stream fixed to stderr.

        Positional values and keyword options (``sep``, ``end``, ...) are
        passed through unchanged; only ``file`` is supplied here.
        """
        print(
            *values,
            file=sys.stderr,
            **print_options,
        )

    def oprint(*values, **print_options):
        """Forward all arguments to ``print``, writing to ``args.output``.

        ``args.output`` is the stream selected by the ``--output`` option.
        Positional values and keyword options are passed through unchanged.
        """
        print(
            *values,
            file=args.output,
            **print_options,
        )

    eprint(f"Transcribing {len(args.files)} files.")

    # Ask for half precision only when running on CUDA; selecting the
    # matching FP16 mode suppresses warnings from Whisper.
    fp16 = args.device == "cuda"

    # Announce, then load, the selected Whisper model
    eprint(f"Loading model '{args.model}' using device '{args.device}'.")
    model = whisper.load_model(args.model, device=args.device)

    # Characters that are path separators or are rejected in file names on
    # common platforms. Transcript text used by --rename has them replaced so
    # the new name always stays a single, valid path component.
    unsafe_chars = str.maketrans(dict.fromkeys('<>:"/\\|?*', "_"))

    # Convert each file: write a summary (full text plus timestamped
    # segments) to the output stream and optionally rename the source file.
    for file in args.files:
        eprint(f"Transcribing '{file}'.")

        oprint(f"{file}")
        if not os.path.exists(file):
            # Record the problem in the summary as well as on stderr, then
            # move on to the remaining files.
            oprint("File does not exist.")
            eprint(f"File '{file}' does not exist.")
            continue

        # Transcribe the file
        result = model.transcribe(file, fp16=fp16)

        # Print the complete text
        complete_text = result["text"].strip()
        oprint("Full Text:")
        oprint(f"  {complete_text}")

        # Print each segment with timestamps
        oprint("Segments:")
        for segment in result["segments"]:
            text = segment["text"].strip()
            start = convert_time(segment["start"])
            end = convert_time(segment["end"])

            timespan = f"[{start.isoformat(timespec='milliseconds')} -> {end.isoformat(timespec='milliseconds')}]"
            oprint(f"  {timespan}: {text}")

        oprint("")

        if args.rename:
            # Prefix the original name with the first 32 characters of the
            # transcript, made safe for use inside a file name.
            prefix = os.path.normcase(complete_text[:32]).translate(unsafe_chars).strip()
            if not prefix:
                # Nothing meaningful to put in the name; leave the file alone.
                eprint(f"Not renaming '{file}': transcription is empty.")
            else:
                path, name = os.path.split(file)
                # os.path.join keeps the result relative when the input had no
                # directory part; formatting "{path}/..." would instead point
                # at the filesystem root.
                new_file = os.path.normpath(os.path.join(path, f"{prefix} - {name}"))
                os.rename(file, new_file)
                # Report only after the rename has actually succeeded.
                eprint(f"Renamed '{file}' to '{new_file}'.")
	#!/usr/bin/env python3

	import whisper
	import argparse
	import os
	import math
	import datetime
	import sys

	if __name__ == "__main__":

	    # Command-line interface. ArgumentDefaultsHelpFormatter makes --help
	    # list the default value of every option.
	    parser = argparse.ArgumentParser(description="Transcribe files using Whisper and output a summary.",
	                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	    parser.add_argument(
	        "--model", help="Whisper model to use.", default="base.en")
	    parser.add_argument("--device", help="Whisper device to use.",
	                        default="cuda", choices=["cuda", "cpu"])
	    parser.add_argument(
	        "--rename", help="Rename files with the transcribed text.", action="store_true")
	    # Transcripts may contain non-ASCII text, so open the summary file as
	    # UTF-8 explicitly rather than with the locale's default encoding.
	    parser.add_argument("--output", help="File to output to.",
	                        type=argparse.FileType('w', encoding="utf-8"), default=sys.stdout)
	    parser.add_argument("files", help="Audio files to transcribe.", nargs="+")
	    args = parser.parse_args()

	    def convert_time(time):
	        """Convert a float number of seconds into a ``datetime.time``.

	        The fractional part becomes microseconds (truncated). Raises
	        ValueError for negative offsets or offsets of 24 hours or more,
	        which ``datetime.time`` cannot represent.
	        """
	        fraction, whole_seconds = math.modf(time)
	        # Split into hours/minutes/seconds so offsets of an hour or more
	        # do not produce a minute value outside 0..59.
	        minutes, seconds = divmod(int(whole_seconds), 60)
	        hours, minutes = divmod(minutes, 60)
	        return datetime.time(hour=hours, minute=minutes, second=seconds,
	                             microsecond=int(fraction * 1e6))

	    def eprint(*a, **kwargs):
	        """Print to stderr, forwarding all arguments to ``print``."""
	        print(*a, file=sys.stderr, **kwargs)

	    def oprint(*a, **kwargs):
	        """Print to the specified output file, forwarding all arguments to ``print``."""
	        print(*a, file=args.output, **kwargs)

	    eprint(f"Transcribing {len(args.files)} files.")

	    # Load the selected Whisper model
	    eprint(f"Loading model '{args.model}' using device '{args.device}'.")
	    model = whisper.load_model(args.model, device=args.device)

	    # Suppress warnings from Whisper by selecting proper FP16 mode
	    fp16 = args.device == "cuda"

	    # Characters that are path separators or are rejected in file names on
	    # common platforms; replaced when transcript text is used by --rename.
	    unsafe_chars = str.maketrans(dict.fromkeys('<>:"/\\|?*', "_"))

	    # Convert each file
	    for file in args.files:
	        eprint(f"Transcribing '{file}'.")

	        oprint(f"{file}")
	        if not os.path.exists(file):
	            oprint("File does not exist.")
	            eprint(f"File '{file}' does not exist.")
	            continue

	        # Transcribe the file
	        result = model.transcribe(file, fp16=fp16)

	        # Print the complete text
	        complete_text = result["text"].strip()
	        oprint("Full Text:")
	        oprint(f"  {complete_text}")

	        # Print each segment with timestamps
	        oprint("Segments:")
	        for segment in result["segments"]:
	            text = segment["text"].strip()
	            start = convert_time(segment["start"])
	            end = convert_time(segment["end"])

	            timespan = f"[{start.isoformat(timespec='milliseconds')} -> {end.isoformat(timespec='milliseconds')}]"
	            oprint(f"  {timespan}: {text}")

	        oprint("")

	        if args.rename:
	            # Prefix the original name with the first 32 characters of the
	            # transcript, made safe for use inside a file name.
	            prefix = os.path.normcase(complete_text[:32]).translate(unsafe_chars).strip()
	            if not prefix:
	                eprint(f"Not renaming '{file}': transcription is empty.")
	            else:
	                path, name = os.path.split(file)
	                # os.path.join keeps the target relative when the input
	                # has no directory part.
	                new_file = os.path.normpath(os.path.join(path, f"{prefix} - {name}"))
	                os.rename(file, new_file)
	                # Report only after the rename has actually succeeded.
	                eprint(f"Renamed '{file}' to '{new_file}'.")