Skip to content

Instantly share code, notes, and snippets.

@fauxneticien
Created September 17, 2023 02:09
Show Gist options
  • Save fauxneticien/14d4987a95fe67c63c8e9c89a8efc93a to your computer and use it in GitHub Desktop.
Save fauxneticien/14d4987a95fe67c63c8e9c89a8efc93a to your computer and use it in GitHub Desktop.
Fetch LibriLightLimited 10h fine-tuning dataset as TSV file
import torchaudio
import pandas as pd
from pathlib import Path
from tqdm import tqdm
# Export the LibriLightLimited 10h fine-tuning subset as individual WAV clips
# plus a tab-separated manifest with one row per clip (columns: path, text).

download_root = Path("tmp/")
output_dir = Path("data/LibriLight10h")
clips_dir = output_dir / "clips"

# torchaudio.save does not create missing parent directories, so the clip
# folder must exist before the first write. Creating the download root up
# front is a harmless precaution for a fresh checkout.
download_root.mkdir(parents=True, exist_ok=True)
clips_dir.mkdir(parents=True, exist_ok=True)

llight10h = torchaudio.datasets.LibriLightLimited(root="tmp/", subset="10h", download=True)

manifest_rows = []

for (audio_samples, sr, text, spk_id, chap_id, utt_id) in tqdm(llight10h):
    # File name is built from the speaker, chapter and utterance ids.
    filename = f"{spk_id}-{chap_id}-{utt_id}.wav"

    torchaudio.save(
        str(clips_dir / filename),
        audio_samples,
        sr
    )

    # Manifest paths are relative to the dataset folder; transcripts are
    # lower-cased before being written out.
    manifest_rows.append({'path': "clips/" + filename, 'text': text.lower()})

manifest_df = pd.DataFrame(manifest_rows)
manifest_df.to_csv(output_dir / "_all.tsv", sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment