Skip to content

Instantly share code, notes, and snippets.

@Qwlouse
Last active October 1, 2022 22:20
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save Qwlouse/3d33c8529f446b9fc5c0 to your computer and use it in GitHub Desktop.
Save Qwlouse/3d33c8529f446b9fc5c0 to your computer and use it in GitHub Desktop.
This notebook preprocesses the TIMIT dataset using MFCCs in the same way that the paper "LSTM: A Search Space Odyssey" used it.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare the TIMIT dataset\n",
"This notebook preprocesses the TIMIT dataset using MFCCs. It also provides the reduced version of TIMIT with only a core test set and the well known train/validation split from [Halberstadt1998]. \n",
"\n",
"### Dependencies\n",
"* numpy\n",
"* h5py\n",
"* scikits.audiolab (works only on python2) \n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/greff/venv/py2/local/lib/python2.7/site-packages/scikits/audiolab/soundio/play.py:48: UserWarning: Could not import alsa backend; most probably, you did not have alsa headers when building audiolab\n",
" warnings.warn(\"Could not import alsa backend; most probably, \"\n"
]
}
],
"source": [
"from __future__ import division, absolute_import, print_function, unicode_literals\n",
"from random import shuffle\n",
"import os\n",
"\n",
"import h5py\n",
"import numpy as np\n",
"import scikits.audiolab as al"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## MFCC Extraction"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"###############################################################################\n",
"# MFCC extraction\n",
"# By Maigo Yun Wang, 02/08/2012 adapted by Klaus Greff 2015\n",
"###############################################################################\n",
"\n",
"def melfb(p, n, fs):\n",
"    \"\"\"\n",
"    Build a Mel filterbank and return it together with the filter centre frequencies.\n",
"\n",
"    Inputs:\n",
"        p:  number of filters in the filterbank\n",
"        n:  length of fft\n",
"        fs: sample rate in Hz\n",
"    Returns:\n",
"        M:  numpy array of shape (p, 1 + n // 2) holding the filter weights\n",
"        CF: numpy array with the p centre frequencies in Hz\n",
"    Ref. www.ifp.illinois.edu/~minhdo/teaching/speaker_recognition/code/melfb.m\n",
"    \"\"\"\n",
"    f0 = 700.0 / fs\n",
"    half = int(np.floor(n / 2))\n",
"    log_step = np.log(1 + 0.5 / f0) / (p + 1)\n",
"    CF = fs * f0 * (np.exp(np.arange(1, p + 1) * log_step) - 1)\n",
"\n",
"    # Positions (in FFT bins) that correspond to filter indices 0, 1, p and p+1\n",
"    edges = n * f0 * (np.exp(np.array([0, 1, p, p + 1]) * log_step) - 1)\n",
"    first_bin = int(np.floor(edges[0])) + 1\n",
"    rise_start = int(np.ceil(edges[1]))\n",
"    fall_end = int(np.floor(edges[2]))\n",
"    last_bin = min(half, int(np.ceil(edges[3]))) - 1\n",
"\n",
"    # Fractional filter index of every bin in [first_bin, last_bin]\n",
"    position = np.log(1 + np.arange(first_bin, last_bin + 1) / f0 / n) / log_step\n",
"    whole = np.floor(position)\n",
"    frac = position - whole\n",
"\n",
"    M = np.zeros((p, 1 + half))\n",
"    # Every column index below is distinct, so the fancy-indexed += is equivalent\n",
"    # to accumulating the entries one at a time.\n",
"    cols = np.arange(rise_start - 1, last_bin)\n",
"    M[whole[cols].astype(int) - 1, cols + 1] += 2 * (1 - frac[cols])\n",
"    cols = np.arange(fall_end)\n",
"    M[whole[cols].astype(int), cols + 1] += 2 * frac[cols]\n",
"    return M, CF\n",
"\n",
"def dctmtx(n):\n",
"    \"\"\"\n",
"    Return the DCT-II matrix of order n as an (n, n) numpy array.\n",
"\n",
"    Row k holds the k-th cosine basis function; row 0 is additionally divided by\n",
"    sqrt(2).\n",
"    \"\"\"\n",
"    basis = np.arange(n).reshape(-1, 1)   # row index\n",
"    sample = np.arange(n).reshape(1, -1)  # column index\n",
"    D = np.sqrt(2.0/n) * np.cos(np.pi * (2*sample+1) * basis / (2*n))\n",
"    D[0] /= np.sqrt(2)\n",
"    return D\n",
"\n",
"def extract(x):\n",
"    \"\"\"\n",
"    Extract MFCC coefficients of the sound x in numpy array format.\n",
"\n",
"    x is an array of audio samples; the constants below assume a 16 kHz sampling\n",
"    rate. An input with more than one dimension is treated as (samples, channels)\n",
"    and averaged over the channels.\n",
"\n",
"    Returns an array with one row per frame and COEFS (13) columns.\n",
"    \"\"\"\n",
"    FS = 16000                              # Sampling rate\n",
"    FRAME_LEN = int(0.025 * FS)             # Frame length\n",
"    FRAME_SHIFT = int(0.01 * FS)            # Frame shift\n",
"    FFT_SIZE = 2048                         # How many points for FFT\n",
"    WINDOW = np.hamming(FRAME_LEN)          # Window function\n",
"    PRE_EMPH = 0.97                         # Pre-emphasis factor\n",
"\n",
"    BANDS = 40                              # Number of Mel filters\n",
"    COEFS = 13                              # Number of Mel cepstra coefficients to keep\n",
"    POWER_SPECTRUM_FLOOR = 1e-100           # Flooring for the power to avoid log(0)\n",
"    M, CF = melfb(BANDS, FFT_SIZE, FS)      # The Mel filterbank matrix and the center frequencies of each band\n",
"    # Rows 0..COEFS-1 of the DCT matrix, i.e. the 0-th coefficient is kept.\n",
"    # Change the index to [1:COEFS+1] if you want to drop the 0-th coefficient.\n",
"    D = dctmtx(BANDS)[0:COEFS]\n",
"\n",
"    if x.ndim > 1:\n",
"        print(\"INFO: Input signal has more than 1 channel; the channels will be averaged.\")\n",
"        x = np.mean(x, axis=1)\n",
"    frames = int((len(x) - FRAME_LEN) / FRAME_SHIFT + 1)\n",
"    feature = []\n",
"    for f in range(frames):\n",
"        # Windowing\n",
"        frame = x[f * FRAME_SHIFT : f * FRAME_SHIFT + FRAME_LEN] * WINDOW\n",
"        # Pre-emphasis\n",
"        frame[1:] -= frame[:-1] * PRE_EMPH\n",
"        # Power spectrum (keep only the non-negative frequencies; the slice bound\n",
"        # must be an integer, hence the floor division)\n",
"        X = np.abs(np.fft.fft(frame, FFT_SIZE)[:FFT_SIZE // 2 + 1]) ** 2\n",
"        X[X < POWER_SPECTRUM_FLOOR] = POWER_SPECTRUM_FLOOR  # Avoid zero\n",
"        # Mel filtering, logarithm, DCT\n",
"        X = np.dot(D, np.log(np.dot(M, X)))\n",
"        feature.append(X)\n",
"    feature = np.vstack(feature)\n",
"    return feature"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"TIMIT_DIR = '../timit'\n",
"#filename = 'timit.h5'\n",
"# no transformation\n",
"#DTYPE = np.float32\n",
"#extractor = lambda x: x.astype(DTYPE).reshape(-1, 1)\n",
"#frame_size=1\n",
"#frame_shift=1\n",
"#derivatives=0\n",
"#preprocessing_description = \"Only minimal preprocessing (normalizing to zero mean and unit standard deviation).\"\n",
"\n",
"# mfcc + 1st and 2nd deriv\n",
"filename = 'timit_mfcc.h5'\n",
"extractor = extract\n",
"frame_size=400\n",
"frame_shift=160\n",
"derivatives=2\n",
"DTYPE = np.float64\n",
"preprocessing_description = \"\"\"Extracted 12 MFCCs coefficients + energy with window size of 25ms and 10ms step. \n",
"Used hamming window and a pre-emphasis coefficient of 0.97.\n",
"Also included 1st and 2nd time-derivative of the signal for a total of 39 feature dimensions.\"\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Phones\n",
"TIMIT uses 61 phones (and erroneously calls them phonemes), but some tasks work with a reduced set of only 39 phones."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl',\n",
" 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi',\n",
" 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy',\n",
" 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau',\n",
" 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v',\n",
" 'w', 'y', 'z', 'zh']\n",
"silence_label = phones.index('h#')\n",
"\n",
"reduce_phones = {p: p for p in phones if p != 'q'} # discard q\n",
"reduce_phones.update({\n",
" 'ae': 'aa',\n",
" 'ax': 'ah', 'ax-h': 'ah',\n",
" 'axr': 'er',\n",
" 'hv': 'hh',\n",
" 'ix': 'ih',\n",
" 'el': 'l',\n",
" 'em': 'm',\n",
" 'en': 'n', 'nx': 'n',\n",
" 'eng': 'ng',\n",
" 'zh': 'sh',\n",
" 'pcl': 'h#', 'tcl': 'h#', 'kcl': 'h#', 'bcl': 'h#', 'dcl': 'h#', 'gcl': 'h#', 'pau': 'h#', 'epi': 'h#',\n",
" 'ux': 'uw'\n",
"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TIMIT Sample Class\n",
"We first write a small class that captures and extracts all important information about a single TIMIT sequence."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class TimitSample(object):\n",
"    \"\"\"\n",
"    One TIMIT utterance, identified by usage, dialect, sex, speaker and sentence.\n",
"\n",
"    The accessor methods read the matching files below TIMIT_DIR. The optional\n",
"    start/stop attributes restrict words, phones and audio to that range of\n",
"    audio sample indices.\n",
"    \"\"\"\n",
"    @classmethod\n",
"    def create(cls, directory, name):\n",
"        \"\"\"\n",
"        Build a sample from the directory and the file name of one of its files.\n",
"\n",
"        The last four path components are read as\n",
"        <usage>/<dialect>/<sex><speaker_id>/<sentence_id>.\n",
"        \"\"\"\n",
"        f = os.path.join(directory, name.split('.')[0])\n",
"        # Normalise separators so the split also works where os.sep is not '/'\n",
"        f = f.replace(os.sep, '/').split('/')[-4:]\n",
"        sample = cls(f[0], f[1], f[2][0], f[2][1:], f[3])\n",
"        return sample\n",
"\n",
"    def __init__(self, usage, dialect, sex, speaker_id, sentence_id,\n",
"                 start=None, stop=None):\n",
"        self.usage = usage\n",
"        self.dialect = dialect\n",
"        self.sex = sex\n",
"        self.speaker_id = speaker_id\n",
"        self.sentence_id = sentence_id\n",
"        self.start = start\n",
"        self.stop = stop\n",
"\n",
"    def _get_path(self, fileending):\n",
"        \"\"\"Return the path of this sample's file with the given ending (with or without leading dot).\"\"\"\n",
"        if not fileending.startswith('.'):\n",
"            fileending = '.' + fileending\n",
"        return os.path.join(TIMIT_DIR, self.usage, self.dialect, self.sex +\n",
"                            self.speaker_id, self.sentence_id + fileending)\n",
"\n",
"    def get_sentence(self):\n",
"        \"\"\"Return (start, stop, sentence) as read from the .txt file.\"\"\"\n",
"        filename = self._get_path('txt')\n",
"        with open(filename, 'r') as f:\n",
"            content = f.read()\n",
"            start, stop, sentence = content.split(' ', 2)\n",
"            return int(start), int(stop), sentence.strip()\n",
"\n",
"    def get_words(self):\n",
"        \"\"\"Return a list of (start, stop, word) from the .wrd file, limited to [self.start, self.stop].\"\"\"\n",
"        filename = self._get_path('wrd')\n",
"        with open(filename, 'r') as f:\n",
"            content = f.readlines()\n",
"            wordlist = [c.strip().split(' ', 2) for c in content]\n",
"            return [(int(start), int(stop), word)\n",
"                    for start, stop, word in wordlist\n",
"                    if (self.start is None or int(start) >= self.start) and\n",
"                    (self.stop is None or int(stop) <= self.stop)]\n",
"\n",
"    def get_phones(self):\n",
"        \"\"\"\n",
"        Return a list of (start, stop, phone, phone_index) from the .phn file,\n",
"        limited to [self.start, self.stop]. phone_index is the position of the\n",
"        phone in the module-level `phones` list.\n",
"        \"\"\"\n",
"        filename = self._get_path('phn')\n",
"        with open(filename, 'r') as f:\n",
"            content = f.readlines()\n",
"            phone_list = [c.strip().split(' ', 2) for c in content]\n",
"            return [(int(start), int(stop), phone, phones.index(phone))\n",
"                    for start, stop, phone in phone_list\n",
"                    if (self.start is None or int(start) >= self.start) and\n",
"                    (self.stop is None or int(stop) <= self.stop)]\n",
"\n",
"    def get_audio_data(self):\n",
"        \"\"\"Return the audio samples of the .wav file as float64, sliced to [self.start:self.stop].\"\"\"\n",
"        f = al.Sndfile(self._get_path('wav'), 'r')\n",
"        try:\n",
"            data = f.read_frames(f.nframes, dtype=np.float64)\n",
"        finally:\n",
"            # Release the file handle; thousands of files are read in one run\n",
"            f.close()\n",
"        return data[self.start:self.stop]\n",
"\n",
"    def get_labels(self, frame_size=1, frame_shift=1):\n",
"        \"\"\"\n",
"        Return one label per frame: the most frequent phone index within each\n",
"        window of frame_size audio samples, with windows frame_shift apart.\n",
"        Audio samples before the first phone are labelled as silence.\n",
"        \"\"\"\n",
"        phones = self.get_phones()\n",
"        begin = self.start if self.start else 0\n",
"        p_extended = [silence_label] * (phones[0][0] - begin)\n",
"        for p in phones:\n",
"            p_extended += [p[3]] * (int(p[1]) - int(p[0]))\n",
"        end = phones[-1][1]\n",
"        windows = zip(range(0, end - begin - frame_size + 1, frame_shift),\n",
"                      range(frame_size, end - begin + 1, frame_shift))\n",
"        labels = [np.bincount(p_extended[w[0]:w[1]]).argmax() for w in windows]\n",
"        return np.array(labels, dtype=np.byte)\n",
"\n",
"    def get_features(self, extractor, frame_size=1, frame_shift=1, derivatives=0):\n",
"        \"\"\"\n",
"        Return (features, labels) for this sample.\n",
"\n",
"        features is extractor(audio) stacked column-wise with `derivatives`\n",
"        successive gradients along the first axis; labels comes from get_labels.\n",
"        \"\"\"\n",
"        d = self.get_audio_data()\n",
"        features = extractor(d)\n",
"\n",
"        feature_derivs = [features]\n",
"        for i in range(derivatives):\n",
"            feature_derivs.append(np.gradient(feature_derivs[-1])[0])\n",
"\n",
"        all_features = np.hstack(feature_derivs)\n",
"        labels = self.get_labels(frame_size, frame_shift)\n",
"        return all_features, labels\n",
"\n",
"    def __unicode__(self):\n",
"        return '<TimitSample ' + '/'.join([self.usage, self.dialect,\n",
"                                           self.sex + self.speaker_id,\n",
"                                           self.sentence_id]) + '>'\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def read_all_samples():\n",
"    \"\"\"Walk TIMIT_DIR and return a TimitSample for every file ending in '.wav'.\"\"\"\n",
"    found = []\n",
"    for dirname, _subdirs, filenames in os.walk(TIMIT_DIR):\n",
"        for n in filenames:\n",
"            if n.endswith('.wav'):\n",
"                found.append(TimitSample.create(dirname, n))\n",
"    return found\n",
"\n",
"def filter_samples(samples, usage=None, dialect=None, sex=None, speaker_id=None,\n",
"                   sentence_id=None):\n",
"    \"\"\"\n",
"    Return the samples whose attributes equal every criterion that is not None.\n",
"    A criterion left at None matches everything.\n",
"    \"\"\"\n",
"    criteria = [('usage', usage), ('dialect', dialect), ('sex', sex),\n",
"                ('speaker_id', speaker_id), ('sentence_id', sentence_id)]\n",
"    active = [(attr, wanted) for attr, wanted in criteria if wanted is not None]\n",
"    return [s for s in samples\n",
"            if all(getattr(s, attr) == wanted for attr, wanted in active)]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extract features"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_features_and_labels_for(samples):\n",
"    \"\"\"\n",
"    Extract features and frame labels for all samples and pad them to the length\n",
"    of the longest feature sequence.\n",
"\n",
"    Uses the module-level extractor, derivatives, frame_size, frame_shift, DTYPE\n",
"    and silence_label settings. Features are padded with zeros, labels with\n",
"    silence_label and masks with zeros.\n",
"\n",
"    Returns (features, labels, masks) with shapes (time, sample, feature),\n",
"    (time, sample, 1) and (time, sample, 1).\n",
"    \"\"\"\n",
"    per_sample = [s.get_features(extractor, derivatives=derivatives, frame_size=frame_size, frame_shift=frame_shift)\n",
"                  for s in samples]\n",
"\n",
"    maxlen = max(feats.shape[0] for feats, _ in per_sample)\n",
"    feature_blocks, label_rows, mask_rows = [], [], []\n",
"    for feats, labs in per_sample:\n",
"        missing_f = maxlen - feats.shape[0]\n",
"        missing_l = maxlen - labs.shape[0]\n",
"\n",
"        feature_pad = np.zeros((missing_f, feats.shape[1]), dtype=DTYPE)\n",
"        label_pad = np.ones(missing_l, dtype=DTYPE) * silence_label\n",
"        mask_pad = np.zeros(missing_l, dtype=DTYPE)\n",
"\n",
"        feature_blocks.append(np.vstack((feats, feature_pad)))\n",
"        label_rows.append(np.hstack((labs, label_pad)))\n",
"        mask_rows.append(np.hstack((np.ones_like(labs), mask_pad)))\n",
"\n",
"    features = np.dstack(feature_blocks).swapaxes(1, 2)\n",
"    labels = np.vstack(label_rows).T.reshape(maxlen, -1, 1)\n",
"    masks = np.vstack(mask_rows).T.reshape(maxlen, -1, 1)\n",
"    return features, labels, masks"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_padded_labels(samples, reduced=False):\n",
"    \"\"\"\n",
"    Return the phone sequence of every sample as one padded integer array.\n",
"\n",
"    @param samples: objects providing get_phones() -> [(start, stop, phone, index), ...]\n",
"    @param reduced: if True, map the phones through reduce_phones (dropping 'q')\n",
"                    and index them into the sorted reduced phone set instead.\n",
"    @return: np.byte array of shape (longest sequence, len(samples), 1), padded with -1\n",
"    \"\"\"\n",
"    if not reduced:\n",
"        L = [[p[3] for p in s.get_phones()] for s in samples]\n",
"    else:\n",
"        # Derive the reduced phone set here instead of relying on a global that\n",
"        # is only defined by a much later cell.\n",
"        reduced_set = sorted({reduce_phones[p] for p in phones if p != 'q'})\n",
"        reduced_index = {p: i for i, p in enumerate(reduced_set)}\n",
"        L = [[reduced_index[reduce_phones[p[2]]] for p in s.get_phones() if p[2] != 'q']\n",
"             for s in samples]\n",
"\n",
"    L_len = max([len(l) for l in L])\n",
"    L_padded = -np.ones([L_len, len(L), 1], dtype=np.byte)\n",
"    for i, l in enumerate(L):\n",
"        L_padded[:len(l), i, 0] = l\n",
"    return L_padded"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Normalization"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_means(input_data, mask=None):\n",
"    \"\"\"\n",
"    Compute the per-feature mean of a batch of sequences, using only the\n",
"    masked-in time steps when a mask is given.\n",
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
"    @return: mean value for each feature. shape = (features, )\n",
"    \"\"\"\n",
"    if mask is None:\n",
"        return input_data[:, :, :].mean((0, 1))\n",
"    # One row per (time, sample) pair, then keep the rows where the mask is 1\n",
"    rows = input_data[:, :, :].reshape(-1, input_data.shape[2])\n",
"    return rows[mask.flatten() == 1].mean(0)\n",
"\n",
"\n",
"def get_stds(input_data, mask=None, channel_mask=None):\n",
"    \"\"\"\n",
"    Compute the per-feature standard deviation of a batch of sequences, using\n",
"    only the masked-in time steps when a mask is given.\n",
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
"    @param channel_mask: Accepted but not used by this function.\n",
"    @return: standard deviation of each feature. shape = (features, )\n",
"    \"\"\"\n",
"    if mask is None:\n",
"        return input_data[:, :, :].std((0, 1))\n",
"    # One row per (time, sample) pair, then keep the rows where the mask is 1\n",
"    rows = input_data[:, :, :].reshape(-1, input_data.shape[2])\n",
"    return rows[mask.flatten() == 1].std(0)\n",
"\n",
"\n",
"def subtract_means(input_data, means, mask=None):\n",
"    \"\"\"\n",
"    Subtract the per-feature means from a batch of sequences; with a mask, only\n",
"    the masked-in time steps are changed.\n",
"    This operation is performed in-place, i.e. the input_data will be modified.\n",
"\n",
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
"    @param means: The means to subtract. shape = (features, )\n",
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
"    \"\"\"\n",
"    if mask is None:\n",
"        input_data[:, :, :] -= means\n",
"        return\n",
"    for i in range(input_data.shape[2]):\n",
"        # input_data[:, :, i] is a view, so the masked update writes through\n",
"        input_data[:, :, i][mask[:, :, 0] == 1] -= means[i]\n",
"\n",
"\n",
"def divide_by_stds(input_data, stds, mask=None):\n",
"    \"\"\"\n",
"    Divide a batch of sequences by the per-feature stds in-place; with a mask,\n",
"    only the masked-in time steps are changed.\n",
"\n",
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
"    @param stds: The standard deviations for every feature. shape = (features, )\n",
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
"    \"\"\"\n",
"    if mask is None:\n",
"        input_data[:, :, :] /= stds\n",
"        return\n",
"    for i in range(input_data.shape[2]):\n",
"        # input_data[:, :, i] is a view, so the masked update writes through\n",
"        input_data[:, :, i][mask[:, :, 0] == 1] /= stds[i]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Playing around"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"all_samples = read_all_samples()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"samples = all_samples[:10]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/greff/venv/py2/lib/python2.7/site-packages/ipykernel/__main__.py:74: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n"
]
}
],
"source": [
"X, T, M = get_features_and_labels_for(samples)\n",
"L = get_padded_labels(samples)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"means = get_means(X, M)\n",
"subtract_means(X, means, M)\n",
"stds = get_stds(X, M)\n",
"divide_by_stds(X, stds, M)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ -1.36270510e-01, -2.64865545e-02, -1.88458491e-03,\n",
" -5.35061361e-03, -4.14322723e-04, -1.62373580e-03,\n",
" -4.65313835e-04, 1.66968832e-04, 1.34837504e-03,\n",
" 2.84097840e-04, 3.70963168e-04, 9.10096640e-05,\n",
" -4.71527965e-05, -1.85187170e-03, -1.06796464e-03,\n",
" 1.74962193e-03, 7.97651630e-04, 6.28126471e-04,\n",
" -1.48645415e-03, -6.65244577e-04, -3.86702196e-04,\n",
" -1.08519194e-04, 1.22560158e-03, 6.08361830e-04,\n",
" 8.05459867e-04, -6.19838093e-04, -3.79369618e-04,\n",
" -7.56204580e-04, 6.21650029e-04, 9.32087564e-05,\n",
" 7.47136741e-04, -3.67511093e-04, -2.92121960e-04,\n",
" -1.71338698e-04, -1.18740905e-04, 3.18499112e-04,\n",
" 2.22929043e-04, 5.36611170e-04, -8.22041047e-04])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.mean((0, 1)) # checking the mean"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9473095372851984"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.reshape(-1, 1)[M.flatten() == 1].std() # checking the variance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing the full(original) TIMIT dataset"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with h5py.File(filename, 'w') as f:\n",
" orig = f.create_group('original')\n",
" orig.attrs['description'] = \"\"\"\n",
" TIMIT\n",
" =====\n",
" \n",
" This is the original TIMIT dataset.\n",
" \n",
" Preprocessing\n",
" -------------\n",
" {}\n",
" \n",
" Content\n",
" -------\n",
" default: All audio data padded to be of equal length\n",
" targets: Phone index for each frame (same shape as default)\n",
" masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n",
" labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n",
" names: list of filenames in the original dataset for each sample\n",
" \"\"\".format(preprocessing_description)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### training data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4620"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_samples = filter_samples(all_samples, usage='train')\n",
"len(train_samples)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"X, T, M = get_features_and_labels_for(train_samples)\n",
"L = get_padded_labels(train_samples)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"means = get_means(X, M)\n",
"subtract_means(X, means, M)\n",
"\n",
"stds = get_stds(X, M)\n",
"divide_by_stds(X, stds, M)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"shuffling = range(len(train_samples))\n",
"shuffle(shuffling)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = X[:, shuffling, :]\n",
"T = T[:, shuffling, :]\n",
"M = M[:, shuffling, :]\n",
"L = L[:, shuffling, :]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"timit_train_names = [train_samples[i]._get_path(\".txt\").encode() for i in shuffling]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f:\n",
" orig = f['original']\n",
" train = orig.create_group('training')\n",
" train.create_dataset('default', data=X, compression='gzip', chunks=(X.shape[0], 1, X.shape[2]))\n",
" train.create_dataset('targets', data=T, compression='gzip', chunks=(T.shape[0], 1, T.shape[2]))\n",
" train.create_dataset('masks', data=M, compression='gzip', chunks=(M.shape[0], 1, M.shape[2]))\n",
" train.create_dataset('labels', data=L)\n",
" train.create_dataset('names', data=np.array(timit_train_names))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del X, T, M, L"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### test data"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1680"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_samples = filter_samples(all_samples, usage='test')\n",
"len(test_samples)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_test, T_test, M_test = get_features_and_labels_for(test_samples)\n",
"L_test = get_padded_labels(test_samples)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subtract_means(X_test, means, M_test)\n",
"divide_by_stds(X_test, stds, M_test)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"timit_test_names = [x._get_path(\".txt\").encode() for x in test_samples]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f:\n",
" orig = f['original']\n",
" test = orig.create_group('test')\n",
" test.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n",
" test.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n",
" test.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n",
" test.create_dataset('labels', data=L_test)\n",
" test.create_dataset('names', data=np.array(timit_test_names))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The reduced Timit Dataset\n",
"\n",
"See the PhD thesis of Andrew K. Halberstadt (1998)."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f: \n",
"#set 'a' to 'w' if you didn't prepare the full (original) TIMIT \n",
" orig = f.create_group('reduced')\n",
" orig.attrs['description'] = \"\"\"\n",
" TIMIT Reduced\n",
" =============\n",
" \n",
" This is the reduced TIMIT dataset. \n",
" It only uses a core test set of 24 speakers, discards all the SA samples from training and has a fixed validation set.\n",
" (For details see the PhD Thesis of Andrew K. Halberstadt 1998.)\n",
" \n",
" Preprocessing\n",
" -------------\n",
" {}\n",
" \n",
" Content\n",
" -------\n",
" default: All audio data padded to be of equal length\n",
" targets: Phone index for each frame (same shape as default)\n",
" masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n",
" labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n",
" labels_reduced: Integer array with all the phones mapped to the reduced phone set. (like labels)\n",
" names: list of filenames in the original dataset for each sample\n",
" \"\"\".format(preprocessing_description)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"24"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"core_test_speakers={\"mdab0\", \"mwbt0\", \"felc0\", \"mtas1\", \"mwew0\", \"fpas0\",\n",
" \"mjmp0\", \"mlnt0\", \"fpkt0\", \"mlll0\", \"mtls0\", \"fjlm0\",\n",
" \"mbpm0\", \"mklt0\", \"fnlp0\", \"mcmj0\", \"mjdh0\", \"fmgd0\",\n",
" \"mgrt0\", \"mnjm0\", \"fdhc0\", \"mjln0\", \"mpam0\", \"fmld0\"}\n",
"len(core_test_speakers) # should be 24"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"192"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"core_test_set= [x for x in all_samples if x.sex + x.speaker_id in core_test_speakers and not x.sentence_id.startswith('sa')]\n",
"len(core_test_set) # should be 192"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"462"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len({x.speaker_id for x in all_samples if x.usage=='train'}) # should be 462"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3696"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_set = [x for x in all_samples if x.usage=='train' and not x.sentence_id.startswith('sa')]\n",
"len(train_set) # should be 3696"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"168"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len({x.speaker_id for x in all_samples if x.usage=='test'}) # should be 168"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val_set_speakers={'faks0', 'mmdb1', 'mbdg0', 'fedw0', 'mtdt0', 'fsem0', 'mdvc0', 'mrjm4', 'mjsw0', 'mteb0',\n",
" 'fdac1', 'fjem0', 'mgwt0', 'mmdm2', 'mpdf0', 'fcmh0', 'mbwm0', 'mcsh0', 'fadg0', 'mgjf0',\n",
" 'mglb0', 'mrtk0', 'mthc0', 'mwjg0', 'fnmr0', 'mbns0', 'mmjr0', 'mdls0', 'mers0', 'fmah0',\n",
" 'fdrw0', 'fcal1', 'mmwh0', 'fjsj0', 'mreb0', 'fgjd0', 'fjmg0', 'mjfc0', 'mrjr0', 'fmml0',\n",
" 'mjar0', 'fkms0', 'fdms0', 'mtaa0', 'frew0', 'mdlf0', 'mrcs0', 'majc0', 'mroa0', 'mrws1'}\n",
"len(val_set_speakers) # should be 50"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"400"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val_set= [x for x in all_samples if x.sex + x.speaker_id in val_set_speakers and not x.sentence_id.startswith('sa')]\n",
"len(val_set) # should be 400"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"39"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reduced_phones = sorted({reduce_phones[p] for p in phones if p != 'q'})\n",
"len(reduced_phones) # should be 39"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing the training set"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_train, T_train, M_train = get_features_and_labels_for(train_set)\n",
"L_train = get_padded_labels(train_set)\n",
"L_train_reduced = get_padded_labels(train_set, reduced=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"means = get_means(X_train, M_train)\n",
"subtract_means(X_train, means, M_train)\n",
"\n",
"stds = get_stds(X_train, M_train)\n",
"divide_by_stds(X_train, stds, M_train)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"shuffling = range(len(train_set))\n",
"shuffle(shuffling)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train = X_train[:, shuffling, :]\n",
"T_train = T_train[:, shuffling, :]\n",
"M_train = M_train[:, shuffling, :]\n",
"L_train = L_train[:, shuffling, :]\n",
"L_train_reduced = L_train_reduced[:, shuffling, :]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rtimit_train_names = [train_set[i]._get_path(\".txt\").encode() for i in shuffling]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f:\n",
" orig = f['reduced']\n",
" train = orig.create_group('training')\n",
" train.create_dataset('default', data=X_train, compression='gzip', chunks=(X_train.shape[0], 1, X_train.shape[2]))\n",
" train.create_dataset('targets', data=T_train, compression='gzip', chunks=(T_train.shape[0], 1, T_train.shape[2]))\n",
" train.create_dataset('masks', data=M_train, compression='gzip', chunks=(M_train.shape[0], 1, M_train.shape[2]))\n",
" train.create_dataset('labels', data=L_train)\n",
" train.create_dataset('labels_reduced', data=L_train_reduced)\n",
" train.create_dataset('names', data=np.array(rtimit_train_names))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del X_train, T_train, M_train, L_train, L_train_reduced"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing the validation set"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_val, T_val, M_val = get_features_and_labels_for(val_set)\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subtract_means(X_val, means, M_val)\n",
"divide_by_stds(X_val, stds, M_val)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"L_val = get_padded_labels(val_set)\n",
"L_val_reduced = get_padded_labels(val_set, reduced=True)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rtimit_val_names = [val_set[i]._get_path(\".txt\").encode() for i in range(len(val_set))]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f:\n",
" orig = f['reduced']\n",
" validation = orig.create_group('validation')\n",
" validation.create_dataset('default', data=X_val, compression='gzip', chunks=(X_val.shape[0], 1, X_val.shape[2]))\n",
" validation.create_dataset('targets', data=T_val, compression='gzip', chunks=(T_val.shape[0], 1, T_val.shape[2]))\n",
" validation.create_dataset('masks', data=M_val, compression='gzip', chunks=(M_val.shape[0], 1, M_val.shape[2]))\n",
" validation.create_dataset('labels', data=L_val)\n",
" validation.create_dataset('labels_reduced', data=L_val_reduced)\n",
" validation.create_dataset('names', data=np.array(rtimit_val_names))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del X_val, T_val, M_val, L_val, L_val_reduced"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing the core test set"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_test, T_test, M_test = get_features_and_labels_for(core_test_set)\n",
"L_test = get_padded_labels(core_test_set)\n",
"L_test_reduced = get_padded_labels(core_test_set, reduced=True)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subtract_means(X_test, means, M_test)\n",
"divide_by_stds(X_test, stds, M_test)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rtimit_test_names = [core_test_set[i]._get_path(\".txt\").encode() for i in range(len(core_test_set))]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with h5py.File(filename, 'a') as f:\n",
" orig = f['reduced']\n",
" validation = orig.create_group('test')\n",
" validation.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n",
" validation.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n",
" validation.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n",
" validation.create_dataset('labels', data=L_test)\n",
" validation.create_dataset('labels_reduced', data=L_test_reduced)\n",
" validation.create_dataset('names', data=np.array(rtimit_test_names))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"del X_test, T_test, M_test, L_test, L_test_reduced"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@gevangelopoulos
Copy link

Hi, I thought I could submit a pull request because I fixed two small problems that occur if someone tries to prepare the reduced TIMIT without preparing the original first, but currently, it's impossible to pull on gists.

In 43, I think the last line should have
train.create_dataset('names', data=np.array(rtimit_train_names))
instead of
train.create_dataset('names', data=np.array(timit_train_names))

and in 29, I added a comment
#set 'a' to 'w' if you didn't prepare the full (original) TIMIT

You can see the changes in my fork.
Cheers,
George

@Qwlouse
Copy link
Author

Qwlouse commented Apr 12, 2016

Hi George, I merged your changes.
Thanks a lot!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment