Created
February 23, 2020 15:00
-
-
Save eugene87222/437942ac45dedf8cb8f6a2e350df6c91 to your computer and use it in GitHub Desktop.
Kaldi data preparation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the language-model corpus: concatenate the transcripts of every
# split's ``text`` file, dropping the leading utterance ID of each line.
import os


def build_corpus(data_dir='./data', modes=('test', 'train')):
    """Write ``<data_dir>/local/corpus.txt`` from the per-split ``text`` files.

    Each line of ``<data_dir>/<split>/text`` is expected to look like
    ``<utteranceID> <word> <word> ...``.  Everything after the first space
    is copied to the corpus, one transcript per line.  Lines that contain
    no space (blank lines, or an utterance ID with no transcript) are
    skipped.

    Args:
        data_dir: Directory that holds one sub-directory per split.
        modes: Names of the splits to read, in the order they are written.

    Raises:
        OSError: If a split's ``text`` file cannot be read or the corpus
            file cannot be written.
    """
    local_dir = os.path.join(data_dir, 'local')
    # makedirs also creates a missing parent and tolerates an existing
    # directory, so no separate (racy) existence check is needed.
    os.makedirs(local_dir, exist_ok=True)

    with open(os.path.join(local_dir, 'corpus.txt'), 'w') as corpus:
        for split in modes:
            with open(os.path.join(data_dir, split, 'text'), 'r') as text_file:
                for line in text_file:
                    # Strip the newline first and re-add it on write, so a
                    # final line without a trailing newline is not glued to
                    # the first transcript of the next split.
                    _, sep, transcript = line.rstrip('\n').partition(' ')
                    if not sep:
                        continue
                    corpus.write(f'{transcript}\n')


if __name__ == '__main__':
    build_corpus()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate Kaldi's ``text`` file (utterance ID -> transcript) for each split.
# Audio is expected under ``<audio_root>/<split>/<speakerID>/`` with file
# names of the form ``<prefix>-<digits>.<ext>``.
import os

# Spoken form of each decimal digit, indexed by the digit's value.
DIGIT_WORDS = (
    'zero', 'one', 'two', 'three', 'four',
    'five', 'six', 'seven', 'eight', 'nine',
)


def digits_to_words(digits):
    """Return the space-separated spoken words for a string of digits.

    Raises:
        ValueError: If a character of ``digits`` is not a decimal digit.
    """
    return ' '.join(DIGIT_WORDS[int(digit)] for digit in digits)


def write_text(audio_root='./digits_audio', data_root='./data',
               modes=('test', 'train')):
    """Write ``<data_root>/<split>/text`` for every split in ``modes``.

    The utterance ID is the audio file name without its extension; the
    transcript is derived from the digit string that follows the first
    ``-`` in the utterance ID.

    Args:
        audio_root: Directory holding ``<split>/<speakerID>/<audio file>``.
        data_root: Directory that receives one sub-directory per split.
        modes: Names of the splits to process.

    Raises:
        IndexError: If an audio file name contains no ``-``.
        ValueError: If the part after the first ``-`` has a non-digit.
        OSError: If a directory cannot be listed or a file written.
    """
    for split in modes:
        audio_path = os.path.join(audio_root, split)
        data_path = os.path.join(data_root, split)
        # Creates missing parents too and is a no-op when the directory
        # already exists.
        os.makedirs(data_path, exist_ok=True)

        entries = []
        for speaker_id in os.listdir(audio_path):
            for audio in os.listdir(os.path.join(audio_path, speaker_id)):
                # splitext handles extensions of any length, unlike [:-4].
                utterance_id = os.path.splitext(audio)[0]
                transcript = digits_to_words(utterance_id.split('-')[1])
                entries.append((utterance_id, transcript))

        # os.listdir order is arbitrary, but Kaldi expects its data files
        # sorted by utterance ID; Python's code-point ordering matches the
        # byte-wise (LC_ALL=C) ordering Kaldi uses.
        entries.sort()
        with open(os.path.join(data_path, 'text'), 'w') as text_file:
            text_file.writelines(
                f'{utterance_id} {transcript}\n'
                for utterance_id, transcript in entries
            )


if __name__ == '__main__':
    write_text()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate Kaldi's ``utt2spk`` file (utterance ID -> speaker ID) for each
# split.  Audio is expected under ``<audio_root>/<split>/<speakerID>/``.
import os


def write_utt2spk(audio_root='./digits_audio', data_root='./data',
                  modes=('test', 'train')):
    """Write ``<data_root>/<split>/utt2spk`` for every split in ``modes``.

    The utterance ID is the audio file name without its extension and the
    speaker ID is the name of the directory that contains the file.

    Args:
        audio_root: Directory holding ``<split>/<speakerID>/<audio file>``.
        data_root: Directory that receives one sub-directory per split.
        modes: Names of the splits to process.

    Raises:
        OSError: If a directory cannot be listed or a file written.
    """
    for split in modes:
        audio_path = os.path.join(audio_root, split)
        data_path = os.path.join(data_root, split)
        # Creates missing parents too and is a no-op when the directory
        # already exists.
        os.makedirs(data_path, exist_ok=True)

        pairs = []
        for speaker_id in os.listdir(audio_path):
            for audio in os.listdir(os.path.join(audio_path, speaker_id)):
                # splitext handles extensions of any length, unlike [:-4].
                pairs.append((os.path.splitext(audio)[0], speaker_id))

        # os.listdir order is arbitrary, but Kaldi expects its data files
        # sorted by utterance ID; Python's code-point ordering matches the
        # byte-wise (LC_ALL=C) ordering Kaldi uses.
        pairs.sort()
        with open(os.path.join(data_path, 'utt2spk'), 'w') as utt2spk_file:
            utt2spk_file.writelines(
                f'{utterance_id} {speaker_id}\n'
                for utterance_id, speaker_id in pairs
            )


if __name__ == '__main__':
    write_utt2spk()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate Kaldi's ``wav.scp`` file (utterance ID -> absolute audio path)
# for each split.  Audio is expected under
# ``<audio_root>/<split>/<speakerID>/``.
import os


def write_wav_scp(audio_root='./digits_audio', data_root='./data',
                  modes=('test', 'train')):
    """Write ``<data_root>/<split>/wav.scp`` for every split in ``modes``.

    The utterance ID is the audio file name without its extension; it is
    paired with the absolute path of the audio file.

    Args:
        audio_root: Directory holding ``<split>/<speakerID>/<audio file>``.
        data_root: Directory that receives one sub-directory per split.
        modes: Names of the splits to process.

    Raises:
        OSError: If a directory cannot be listed or a file written.
    """
    for split in modes:
        audio_path = os.path.join(audio_root, split)
        data_path = os.path.join(data_root, split)
        # Creates missing parents too and is a no-op when the directory
        # already exists.
        os.makedirs(data_path, exist_ok=True)

        entries = []
        for speaker_id in os.listdir(audio_path):
            speaker_dir = os.path.join(audio_path, speaker_id)
            for audio in os.listdir(speaker_dir):
                # splitext handles extensions of any length, unlike [:-4].
                utterance_id = os.path.splitext(audio)[0]
                wav_path = os.path.abspath(os.path.join(speaker_dir, audio))
                entries.append((utterance_id, wav_path))

        # os.listdir order is arbitrary, but Kaldi expects its data files
        # sorted by utterance ID; Python's code-point ordering matches the
        # byte-wise (LC_ALL=C) ordering Kaldi uses.
        entries.sort()
        with open(os.path.join(data_path, 'wav.scp'), 'w') as scp_file:
            scp_file.writelines(
                f'{utterance_id} {wav_path}\n'
                for utterance_id, wav_path in entries
            )


if __name__ == '__main__':
    write_wav_scp()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment