Skip to content

Instantly share code, notes, and snippets.

@ucalyptus2
Created July 25, 2023 07:23
Show Gist options
  • Save ucalyptus2/0fb5f2548a871e3dd247582a7fb02228 to your computer and use it in GitHub Desktop.
Save ucalyptus2/0fb5f2548a871e3dd247582a7fb02228 to your computer and use it in GitHub Desktop.
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline as pipesetup
import tqdm
import jiwer
# Transcribe samples from the "bn_in" train split of google/fleurs with the
# ai4bharat/indicwav2vec_v1_bengali model and report the word error rate
# (WER) computed by jiwer, both periodically and once at the end.

# Use a streaming dataset
fleurs = load_dataset("google/fleurs", "bn_in", split="train", streaming=True)
# Get an iterator for the dataset
iterator = iter(fleurs)
pred, gt = [], []
pipeline = pipesetup(model="ai4bharat/indicwav2vec_v1_bengali")

NUM_SAMPLES = 1000  # Adjust according to your requirement
REPORT_EVERY = 10  # Print a running WER after this many new samples

# zip() stops as soon as either side is exhausted, so a stream with fewer than
# NUM_SAMPLES items ends the loop cleanly instead of raising StopIteration.
# range() comes first so that no extra item is pulled from the stream once the
# sample budget has been used up.
for i, data in tqdm.tqdm(zip(range(NUM_SAMPLES), iterator), total=NUM_SAMPLES):
    reference = data["raw_transcription"]
    # NOTE(review): the raw array is passed without its sampling rate, so this
    # assumes the audio already matches the rate the model expects - confirm.
    # NOTE(review): max_new_tokens is a generation argument; it presumably has
    # no effect on a wav2vec2 (CTC) model - confirm before removing.
    result = pipeline(data["audio"]["array"], max_new_tokens=448)
    gt.append(reference)
    pred.append(result["text"])
    if (i + 1) % REPORT_EVERY == 0:
        # Running WER over every sample processed so far.
        print(f'WER after {i+1} samples: {jiwer.wer(gt,pred)}')

# WER computation after all the samples have been processed
if not gt:
    # Fail with a clear message rather than handing jiwer empty inputs.
    raise RuntimeError("No samples were read from the dataset")
final_wer = jiwer.wer(gt, pred)
print(f'Final WER: {final_wer}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment