Created
May 27, 2024 21:31
-
-
Save laubonghaudoi/4bcb67ee5f7721402b2ce83f08d65d82 to your computer and use it in GitHub Desktop.
轉寫粵語音頻
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from time import time

import librosa
import torch
from transformers import pipeline

# Transcribe a Cantonese recording with a fine-tuned Whisper checkpoint and
# report how long the transcription took.

AUDIO_PATH = 'test.mp3'
MODEL_NAME = "alvanlii/whisper-small-cantonese"
SAMPLE_RATE = 16000   # Hz; the rate the audio is resampled to on load
CHUNK_LENGTH_S = 30   # length of each window the pipeline feeds the model
STRIDE_LENGTH_S = 5   # overlap between neighbouring windows
BATCH_SIZE = 8        # number of windows decoded per forward pass

# Load the audio file, resampled to SAMPLE_RATE
y, sr = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE)

# Initialize the pipeline; use the first GPU when one is available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=CHUNK_LENGTH_S,
    stride_length_s=STRIDE_LENGTH_S,
    device=device,
)

# Force the decoder prompt to "transcribe" with the 'zh' language token.
# NOTE(review): newer transformers releases prefer passing language/task via
# generate_kwargs -- confirm against the installed version.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language='zh', task="transcribe"
)

# Hand the whole waveform to the pipeline and let it do the chunking.
# Slicing the audio manually into overlapping windows and joining the
# per-window texts transcribes every overlap region twice and inserts
# punctuation at arbitrary cut points; the pipeline's own chunking merges
# the overlapping windows into a single transcript and still batches them.
start_time = time()
result = pipe({"raw": y, "sampling_rate": sr}, batch_size=BATCH_SIZE)
end_time = time()
elapsed_time = end_time - start_time

# The merged transcription of the full recording
full_transcription = result['text']
print(full_transcription)
print(f"Time taken for transcription: {elapsed_time} seconds")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment