# laubonghaudoi/transcribe_cantonese.py

## transcribe_cantonese.py
import librosa
import torch
from transformers import pipeline
from time import time

# Script configuration.
MODEL_NAME = "alvanlii/whisper-small-cantonese"
AUDIO_PATH = 'test.mp3'
SAMPLE_RATE = 16000  # Hz; the audio is resampled to this rate when loaded
CHUNK_LENGTH_S = 30  # length of each window the pipeline cuts long audio into
STRIDE_LENGTH_S = 5  # overlap kept on each side of a window for stitching
BATCH_SIZE = 8  # number of windows pushed through the model per batch


def build_pipeline(model_name=MODEL_NAME):
    """Create the speech-recognition pipeline used for transcription.

    The pipeline runs on the first CUDA device when one is available and on
    the CPU otherwise. It is configured for long-form audio: recordings
    longer than ``CHUNK_LENGTH_S`` seconds are split into overlapping windows
    by the pipeline itself, and the overlapping parts are merged when the
    text is assembled, so callers must pass the *whole* recording rather
    than pre-cut slices.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        The configured ``transformers`` pipeline object.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=CHUNK_LENGTH_S,
        stride_length_s=STRIDE_LENGTH_S,
        device=device,
    )

    # Pin the decoder prompt to language 'zh' and the "transcribe" task so
    # the model does not auto-detect the language or translate.
    pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
        language='zh', task="transcribe"
    )
    return pipe


def transcribe(pipe, audio, sampling_rate, batch_size=BATCH_SIZE) -> str:
    """Transcribe one complete recording and return its text.

    The recording is handed to the pipeline in a single call. Slicing the
    signal into overlapping pieces by hand and joining the per-piece texts
    would repeat everything spoken inside each overlap, because every piece
    is decoded independently; leaving the chunking to the pipeline avoids
    that duplication while still batching the windows.

    Args:
        pipe: Callable speech-recognition pipeline (see ``build_pipeline``).
        audio: One-dimensional sequence of audio samples.
        sampling_rate: Sampling rate of ``audio`` in Hz.
        batch_size: Number of windows processed per model batch.

    Returns:
        The transcription, or an empty string when ``audio`` has no samples.
    """
    # len() rather than truthiness: ``audio`` is normally a NumPy array,
    # whose truth value is ambiguous.
    if len(audio) == 0:
        return ""

    result = pipe(
        {"raw": audio, "sampling_rate": sampling_rate},
        batch_size=batch_size,
    )
    return result["text"]


def main():
    """Load the audio file, transcribe it, and print the text and timing."""
    y, sr = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE)
    pipe = build_pipeline()

    # Only the transcription itself is timed, not audio or model loading.
    start_time = time()
    full_transcription = transcribe(pipe, y, sr)
    elapsed_time = time() - start_time

    print(full_transcription)
    print(f"Time taken for transcription: {elapsed_time} seconds")


if __name__ == "__main__":
    main()