@sotelo
Created January 12, 2017 20:23
pavoque to hdf5: pack the Merlin-normalised PAVOQUE acoustic features (.cmp), character-coded transcripts, and emotion/style labels into a single Fuel-compatible HDF5 dataset with train/valid/test splits.
from run_merlin import prepare_file_path_list, read_file_list
from io_funcs.binary_io import BinaryIOCollection
import numpy
import h5py
import pickle
from fuel.datasets.hdf5 import H5PYDataset
io_fun = BinaryIOCollection()
n_outs = 63  # 187  (acoustic feature dimension per frame of the .cmp files)
save_dir = '/Tmp/sotelo/data/pavoque/'
base_dir = '/Tmp/sotelo/results/merlin/egs/build_your_own_voice/s1/'
merlin_data_dir = base_dir + \
    'experiments/pavoque/acoustic_model/data/'
file_list = read_file_list(merlin_data_dir + 'file_id_list.scp')
# the second '-'-separated field of each file id is the emotion/style label
emotion_set = [x.split('-')[1] for x in file_list]
emotion_set = sorted(list(set(emotion_set)))
emotion_dict = {x: i + 1 for i, x in enumerate(emotion_set)}
# Transcript file from the raw data, i.e. before Merlin processing
raw_dir = base_dir + 'raw_data/pavoque/'
text_file = raw_dir + 'utts.data'
with open(text_file) as f:
    text_data = f.readlines()
id_from_text = [x.split()[1] for x in text_data]
# sanity check: the transcript ids must match the Merlin file list exactly
error_files = [
    (i, x) for i, x in enumerate(id_from_text) if x not in file_list]
assert id_from_text == file_list
# keep only the quoted transcript on each utts.data line
text_data = ['"'.join(x.strip().split('"')[1:-1]) for x in text_data]
char_set = sorted(list(set(''.join(text_data).lower())))
char2code = {x: i + 1 for i, x in enumerate(char_set)}
with open(save_dir + 'char2code.pkl', 'wb') as f:
    pickle.dump(char2code, f)
audio_files = prepare_file_path_list(
    file_list, merlin_data_dir + 'nn_norm_mgc_lf0_vuv_bap_63', '.cmp')
resulth5 = h5py.File(
    '/Tmp/sotelo/data/pavoque/pavoque.hdf5', mode='w')
num_files = len(file_list)
# Variable-length feature sequences are stored flattened (vlen float32);
# their original 2D shapes live in a separate dataset attached as a
# 'shapes' dimension scale, following the Fuel convention for non-uniform data.
features_h5 = resulth5.create_dataset(
    'features', (num_files,),
    dtype=h5py.special_dtype(vlen=numpy.dtype('float32')))
features_shape_h5 = resulth5.create_dataset(
    'features_shapes', (num_files, 2), dtype='int32')
features_h5.dims.create_scale(features_shape_h5, 'shapes')
features_h5.dims[0].attach_scale(features_shape_h5)
features_shape_labels = resulth5.create_dataset(
    'features_shape_labels', (2,), dtype='S11')  # 'S7' would truncate 'num_feature'
features_shape_labels[...] = [
    'time_step'.encode('utf8'),
    'num_feature'.encode('utf8')]
features_h5.dims.create_scale(
    features_shape_labels, 'shape_labels')
features_h5.dims[0].attach_scale(features_shape_labels)
text_h5 = resulth5.create_dataset(
    'text', (num_files,),
    dtype=h5py.special_dtype(vlen=numpy.dtype('int32')))
speaker_index_h5 = resulth5.create_dataset(
    'speaker_index', (num_files, 1), dtype='uint8')
# Shuffle the example order with a fixed seed so that the contiguous
# train/valid/test splits defined below are random w.r.t. the file list.
order = list(range(num_files))
numpy.random.seed(1)
numpy.random.shuffle(order)
for i, idx in enumerate(order):
    if i % 100 == 0:
        print(i)
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        audio_files[idx], n_outs)
    features_h5[i] = out_features.flatten()
    features_shape_h5[i] = numpy.array(out_features.shape)
    # the 'speaker' index actually encodes the PAVOQUE emotion/style label
    speaker_label = id_from_text[idx].split('-')[1]
    speaker_index_h5[i] = emotion_dict[speaker_label]
    text_h5[i] = numpy.array(
        [char2code[x.lower()] for x in text_data[idx]], dtype='int32')
# 90% train / 5% valid / 5% test, taken contiguously over the shuffled order.
end_train = int(.9 * num_files)
end_valid = int(.95 * num_files)
end_test = num_files
split_dict = {
    'train': {'features': (0, end_train),
              'text': (0, end_train),
              'speaker_index': (0, end_train)},
    'valid': {'features': (end_train, end_valid),
              'text': (end_train, end_valid),
              'speaker_index': (end_train, end_valid)},
    'test': {'features': (end_valid, end_test),
             'text': (end_valid, end_test),
             'speaker_index': (end_valid, end_test)}}
resulth5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
resulth5.flush()
resulth5.close()
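# --- Optional sanity check: a minimal sketch, not part of the original pipeline.
# It assumes the file written above exists and that Fuel's H5PYDataset applies
# the 'shapes' dimension scale, returning each feature array reshaped to
# (time_step, num_feature) rather than flattened.
train_set = H5PYDataset(
    '/Tmp/sotelo/data/pavoque/pavoque.hdf5', which_sets=('train',))
print(train_set.sources)
print(train_set.num_examples)
handle = train_set.open()
batch = dict(zip(train_set.sources, train_set.get_data(handle, slice(0, 1))))
print(batch['features'][0].shape)   # expected: (time_step, 63)
print(batch['speaker_index'][0], batch['text'][0][:10])
train_set.close(handle)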