Skip to content

Instantly share code, notes, and snippets.

@arijitx
Created July 3, 2022 18:22
Show Gist options
  • Save arijitx/60131a92377d383b7bd66a49af0eb324 to your computer and use it in GitHub Desktop.
Save arijitx/60131a92377d383b7bd66a49af0eb324 to your computer and use it in GitHub Desktop.
import json
import sys
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
import librosa
import random
# Usage banner, printed on every run (kept exactly as the original emits it).
print("python create_manifest.py script_path create_train_test_bool(True/False)")

_USAGE = (
    "usage: python create_manifest.py script_path "
    "create_train_test_bool(True/False) [test_size]"
)
_TRUE_FLAGS = ("true", "t", "yes", "y", "1")
_FALSE_FLAGS = ("false", "f", "no", "n", "0")


def _parse_bool(value):
    """Interpret a command-line flag such as ``True`` / ``False`` as a bool.

    ``bool("False")`` is ``True`` because any non-empty string is truthy, so
    the text has to be compared explicitly.

    Raises:
        ValueError: if *value* is not a recognised true/false spelling.
    """
    flag = value.strip().lower()
    if flag in _TRUE_FLAGS:
        return True
    if flag in _FALSE_FLAGS:
        return False
    raise ValueError(f"expected True or False, got {value!r}")


if len(sys.argv) < 3:
    sys.exit(_USAGE)

script_path = sys.argv[1]
try:
    create_train_test = _parse_bool(sys.argv[2])
except ValueError as err:
    sys.exit(f"{err}\n{_USAGE}")

# Number of shuffled entries held out for the test manifest; stays 0 when no
# train/test split was requested.
test_size = 0
if create_train_test:
    if len(sys.argv) < 4:
        sys.exit(f"test_size is required when a train/test split is requested\n{_USAGE}")
    test_size = int(sys.argv[3])
    if test_size < 0:
        sys.exit("test_size must be non-negative")

manifest_list = []


def process(line):
    """Build one manifest entry from a ``audio_path<TAB>transcript`` line.

    Args:
        line: One line of the script file. The first tab-separated field is
            the audio file path, the second is the transcript; any further
            fields are ignored.

    Returns:
        A dict with the keys ``audio_filepath``, ``duration`` (as reported by
        librosa for that file) and ``text`` (the transcript with surrounding
        whitespace stripped).

    Raises:
        ValueError: if the line does not contain a tab-separated transcript.
    """
    fields = line.split("\t")
    if len(fields) < 2:
        # Fail before the (comparatively expensive) audio probe, with a
        # message that points at the offending line.
        raise ValueError(f"expected 'audio_path<TAB>transcript', got {line!r}")
    audio_path = fields[0]
    # NOTE(review): newer librosa releases appear to rename this keyword to
    # `path` - confirm against the installed librosa version.
    duration = librosa.core.get_duration(filename=audio_path)
    transcript = fields[1].strip()
    return {
        "audio_filepath": audio_path,
        "duration": duration,
        "text": transcript,
    }


def _write_manifest(path, entries):
    """Write *entries* to *path* as UTF-8 text, one JSON object per line."""
    with open(path, 'w', encoding='utf8') as manifest_file:
        for entry in entries:
            json.dump(entry, manifest_file, ensure_ascii=False)
            manifest_file.write("\n")


if __name__ == '__main__':
    with open(script_path, encoding='utf8') as script_file:
        # Skip blank lines (e.g. a trailing newline at end of file) so they
        # do not abort the whole run inside process().
        script = [line for line in script_file if line.strip()]

    manifest_list = thread_map(process, script, max_workers=6)
    print("total size", len(manifest_list))
    random.shuffle(manifest_list)

    # Split by an explicit index: slicing with [:-test_size] yields an empty
    # list when test_size is 0, which would silently drop every entry.
    split_at = max(len(manifest_list) - test_size, 0)
    _write_manifest('train_manifest.json', manifest_list[:split_at])
    if create_train_test:
        _write_manifest('test_manifest.json', manifest_list[split_at:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment