Skip to content

Instantly share code, notes, and snippets.

@eugene87222
Created February 23, 2020 15:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eugene87222/437942ac45dedf8cb8f6a2e350df6c91 to your computer and use it in GitHub Desktop.
Save eugene87222/437942ac45dedf8cb8f6a2e350df6c91 to your computer and use it in GitHub Desktop.
Kaldi data preparation
import os
def build_corpus(data_root='./data', modes=('test', 'train')):
    """Write ``<data_root>/local/corpus.txt`` from the per-split ``text`` files.

    Each line of ``<data_root>/<mode>/text`` is split on single spaces; the
    first field (the utterance ID) is dropped and the remaining fields are
    written to the corpus file as one line.

    Args:
        data_root: Directory that contains one sub-directory per mode.
        modes: Split names to read, in the order they are appended.

    Raises:
        FileNotFoundError: If a ``text`` file for one of the modes is missing.
    """
    local_dir = f'{data_root}/local'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(local_dir, exist_ok=True)
    with open(f'{local_dir}/corpus.txt', 'w') as corpus_file:
        for m in modes:
            with open(f'{data_root}/{m}/text', 'r') as text_file:
                # Stream line by line instead of loading the whole file.
                for line in text_file:
                    words = line.rstrip('\n').split(' ')[1:]
                    # A line holding only an utterance ID contributes nothing.
                    if words:
                        # Always terminate the line ourselves so a final line
                        # lacking '\n' is not glued to the next split's data.
                        corpus_file.write(' '.join(words) + '\n')


if __name__ == '__main__':
    build_corpus()
import os
# Spoken form of each decimal digit, indexed by the digit's integer value.
DIGIT_WORDS = (
    'zero', 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
)


def write_text_files(audio_root='./digits_audio', data_root='./data',
                     modes=('test', 'train')):
    """Write the ``text`` file (utterance ID -> transcript) for each mode.

    The audio tree is walked as ``<audio_root>/<mode>/<speaker>/<file>``.
    The utterance ID is the file name without its extension, and the
    transcript is built from the digit string found after the first ``-``
    in that ID (``spk-12`` -> ``one two``).

    Args:
        audio_root: Directory holding one audio sub-directory per mode.
        data_root: Directory under which ``<mode>/text`` is written.
        modes: Split names to process.

    Raises:
        IndexError: If a file name contains no ``-`` separator.
        ValueError: If the part after ``-`` contains a non-digit character.
    """
    for m in modes:
        audio_path = f'{audio_root}/{m}'
        data_path = f'{data_root}/{m}'
        # makedirs also creates a missing data_root (plain mkdir would fail).
        os.makedirs(data_path, exist_ok=True)
        entries = []
        for spk_id in os.listdir(audio_path):
            for audio in os.listdir(f'{audio_path}/{spk_id}'):
                # splitext handles any extension length, unlike audio[:-4].
                utterance_id = os.path.splitext(audio)[0]
                digits = utterance_id.split('-')[1]
                transcript = ' '.join(DIGIT_WORDS[int(d)] for d in digits)
                entries.append((utterance_id, transcript))
        # os.listdir order is arbitrary; Kaldi expects its data files to be
        # sorted by utterance ID.
        entries.sort()
        with open(f'{data_path}/text', 'w') as out:
            out.writelines(f'{utt} {spoken}\n' for utt, spoken in entries)


if __name__ == '__main__':
    write_text_files()
import os
def write_utt2spk_files(audio_root='./digits_audio', data_root='./data',
                        modes=('test', 'train')):
    """Write the ``utt2spk`` file (utterance ID -> speaker ID) for each mode.

    The audio tree is walked as ``<audio_root>/<mode>/<speaker>/<file>``.
    The speaker ID is the sub-directory name and the utterance ID is the
    file name without its extension.

    Args:
        audio_root: Directory holding one audio sub-directory per mode.
        data_root: Directory under which ``<mode>/utt2spk`` is written.
        modes: Split names to process.
    """
    for m in modes:
        audio_path = f'{audio_root}/{m}'
        data_path = f'{data_root}/{m}'
        # makedirs also creates a missing data_root (plain mkdir would fail).
        os.makedirs(data_path, exist_ok=True)
        pairs = []
        for spk_id in os.listdir(audio_path):
            for audio in os.listdir(f'{audio_path}/{spk_id}'):
                # splitext handles any extension length, unlike audio[:-4].
                pairs.append((os.path.splitext(audio)[0], spk_id))
        # os.listdir order is arbitrary; Kaldi expects its data files to be
        # sorted by utterance ID.
        pairs.sort()
        with open(f'{data_path}/utt2spk', 'w') as out:
            out.writelines(f'{utt} {spk}\n' for utt, spk in pairs)


if __name__ == '__main__':
    write_utt2spk_files()
import os
def write_wav_scp_files(audio_root='./digits_audio', data_root='./data',
                        modes=('test', 'train')):
    """Write the ``wav.scp`` file (utterance ID -> audio path) for each mode.

    The audio tree is walked as ``<audio_root>/<mode>/<speaker>/<file>``.
    The utterance ID is the file name without its extension, and the path
    written next to it is the absolute path of the audio file.

    Args:
        audio_root: Directory holding one audio sub-directory per mode.
        data_root: Directory under which ``<mode>/wav.scp`` is written.
        modes: Split names to process.
    """
    for m in modes:
        audio_path = f'{audio_root}/{m}'
        data_path = f'{data_root}/{m}'
        # makedirs also creates a missing data_root (plain mkdir would fail).
        os.makedirs(data_path, exist_ok=True)
        records = []
        for spk_id in os.listdir(audio_path):
            speaker_dir = f'{audio_path}/{spk_id}'
            for audio in os.listdir(speaker_dir):
                # splitext handles any extension length, unlike audio[:-4].
                utterance_id = os.path.splitext(audio)[0]
                records.append(
                    (utterance_id, os.path.abspath(f'{speaker_dir}/{audio}'))
                )
        # os.listdir order is arbitrary; Kaldi expects its data files to be
        # sorted by utterance ID.
        records.sort()
        with open(f'{data_path}/wav.scp', 'w') as out:
            out.writelines(f'{utt} {wav}\n' for utt, wav in records)


if __name__ == '__main__':
    write_wav_scp_files()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment