Skip to content

Instantly share code, notes, and snippets.

@laubonghaudoi
Created May 27, 2024 21:31
Show Gist options
  • Save laubonghaudoi/4bcb67ee5f7721402b2ce83f08d65d82 to your computer and use it in GitHub Desktop.
Save laubonghaudoi/4bcb67ee5f7721402b2ce83f08d65d82 to your computer and use it in GitHub Desktop.
轉寫粵語音頻
import librosa
import torch
from transformers import pipeline
from time import time
# Transcribe a Cantonese audio file with a fine-tuned Whisper checkpoint and
# print the transcription together with the wall-clock time the inference took.

# Load the audio file, asking librosa to resample it to 16 kHz.
y, sr = librosa.load('test.mp3', sr=16000)
MODEL_NAME = "alvanlii/whisper-small-cantonese"

# Initialize the pipeline, using the first CUDA device when one is available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Pin the decoder prompt to language 'zh' and the "transcribe" task.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language='zh', task="transcribe"
)

# Pass the whole waveform in a single call instead of slicing it by hand.
# Hand-made overlapping slices are transcribed independently, so the speech in
# every overlap region ends up in the joined output twice. With chunk_length_s
# set above, the pipeline does the overlapping-window chunking itself and
# reconciles the overlaps when it stitches the text back together; batch_size
# then applies to those internal chunks.
audio = {"raw": y, "sampling_rate": sr}

start_time = time()
result = pipe(audio, batch_size=8)
end_time = time()
elapsed_time = end_time - start_time

# A single input yields a single result dict whose 'text' is the full transcript.
full_transcription = result['text']
print(full_transcription)
print(f"Time taken for transcription: {elapsed_time} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment