Skip to content

Instantly share code, notes, and snippets.

@sotelo
Created January 12, 2017 18:16
Show Gist options
  • Save sotelo/0745eb639dc9e5b22b83fbf0ef749ff5 to your computer and use it in GitHub Desktop.
Save sotelo/0745eb639dc9e5b22b83fbf0ef749ff5 to your computer and use it in GitHub Desktop.
process pavoque labels
import os
import glob
from shutil import copyfile
# Input locations: transcripts and recordings sit in sibling
# sub-directories of the raw data root.
base_dir = '/Tmp/sotelo/data/german/raw'
text_dir = os.path.join(base_dir, 'text')
wav_dir = os.path.join(base_dir, 'wav/')

# Recursively gather the path of every ".wav" file below wav_dir,
# then order the paths lexicographically.
audio_files = []
for folder, _subdirs, filenames in os.walk(wav_dir):
    for filename in filenames:
        if filename.endswith('.wav'):
            audio_files.append(os.path.join(folder, filename))
audio_files.sort()

# Same walk for the ".txt" files below text_dir.
text_files = []
for folder, _subdirs, filenames in os.walk(text_dir):
    for filename in filenames:
        if filename.endswith('.txt'):
            text_files.append(os.path.join(folder, filename))
text_files.sort()
# Reduce every audio path to its file stem (last '/'-separated component,
# cut at the first '.') and split that stem on '-'.
audio_names = []
for audio_path in audio_files:
    audio_stem = audio_path.strip().split('/')[-1].split('.')[0]
    audio_names.append(audio_stem.split('-'))

# Transcript stems are used whole; they serve as the keys for the texts.
text_names = []
for text_path in text_files:
    text_names.append(text_path.strip().split('/')[-1].split('.')[0])

# Transpose the split stems into two parallel tuples: the part before the
# dash and the part after it.
audio_names, audio_emotions = zip(*audio_names)
# Map each transcript stem to the single line of text held in its file.
all_text = {}
for transcript_path, transcript_key in zip(text_files, text_names):
    with open(transcript_path, 'r') as transcript:
        transcript_lines = transcript.readlines()
    # Each transcript file must contain exactly one line.
    assert len(transcript_lines) == 1
    all_text[transcript_key] = transcript_lines[0]
# Build one '( <audio stem> "<transcript>" )' record per recording and
# write them all to utts.data inside base_dir.
proc_txt = []
for audio_file in audio_files:
    # File stem: last '/'-separated component, cut at the first '.'.
    audio_name = audio_file.strip().split('/')[-1].split('.')[0]
    # The stem is "<code>-<emotion>"; only the code part keys the
    # transcript table. Unpacking fails if the stem does not contain
    # exactly one dash.
    audio_code, audio_emotion = audio_name.split('-')
    # Transcripts are stored as read from disk, so they may still end
    # with a line terminator. Drop it, otherwise the record is broken
    # across two lines with the closing '" )' on the second one.
    text = all_text[audio_code].rstrip('\r\n')
    # NOTE(review): double quotes inside the transcript are written
    # unescaped -- confirm whether the consumer of utts.data needs them
    # escaped.
    proc_txt.append('( ' + audio_name + ' "' + text + '" )\n')

with open(os.path.join(base_dir, 'utts.data'), 'w') as f:
    f.writelines(proc_txt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment