Qwlouse/Prepare_TIMIT.ipynb

## Prepare_TIMIT.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare the TIMIT dataset\n",
    "This notebook preprocesses the TIMIT dataset using MFCCs. It also provides the reduced version of TIMIT with only a core test set and the well known train/validation split from [Halberstadt1998]. \n",
    "\n",
    "### Dependencies\n",
    "* numpy\n",
    "* h5py\n",
    "* scikits.audiolab (works only on python2) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/greff/venv/py2/local/lib/python2.7/site-packages/scikits/audiolab/soundio/play.py:48: UserWarning: Could not import alsa backend; most probably, you did not have alsa headers when building audiolab\n",
      "  warnings.warn(\"Could not import alsa backend; most probably, \"\n"
     ]
    }
   ],
   "source": [
    "from __future__ import division, absolute_import, print_function, unicode_literals\n",
    "from random import shuffle\n",
    "import os\n",
    "\n",
    "import h5py\n",
    "import numpy as np\n",
    "import scikits.audiolab as al"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MFCC Extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "###############################################################################\n",
    "#   MFCC extraction\n",
    "#   By Maigo Yun Wang, 02/08/2012 adapted by Klaus Greff 2015\n",
    "###############################################################################\n",
    "\n",
    "def melfb(p, n, fs):\n",
    "    \"\"\"\n",
    "    Return a Mel filterbank matrix as a numpy array.\n",
    "    Inputs:\n",
    "        p:  number of filters in the filterbank\n",
    "        n:  length of fft\n",
    "        fs: sample rate in Hz\n",
    "    Ref. www.ifp.illinois.edu/~minhdo/teaching/speaker_recognition/code/melfb.m\n",
    "    \"\"\"\n",
    "    f0 = 700.0 / fs\n",
    "    fn2 = int(np.floor(n/2))\n",
    "    lr = np.log(1 + 0.5/f0) / (p+1)\n",
    "    CF = fs * f0 * (np.exp(np.arange(1, p+1) * lr) - 1)\n",
    "    bl = n * f0 * (np.exp(np.array([0, 1, p, p+1]) * lr) - 1)\n",
    "    b1 = int(np.floor(bl[0])) + 1\n",
    "    b2 = int(np.ceil(bl[1]))\n",
    "    b3 = int(np.floor(bl[2]))\n",
    "    b4 = min(fn2, int(np.ceil(bl[3]))) - 1\n",
    "    pf = np.log(1 + np.arange(b1, b4+1) / f0 / n) / lr\n",
    "    fp = np.floor(pf)\n",
    "    pm = pf - fp\n",
    "    M = np.zeros((p, 1+fn2))\n",
    "    for c in range(b2-1, b4):\n",
    "        r = fp[c] - 1\n",
    "        M[int(r), c+1] += 2 * (1 - pm[c])\n",
    "    for c in range(b3):\n",
    "        r = fp[c]\n",
    "        M[int(r), c+1] += 2 * pm[c]\n",
    "    return M, CF\n",
    "\n",
    "def dctmtx(n):\n",
    "    \"\"\"\n",
    "    Return the DCT-II matrix of order n as a numpy array.\n",
    "    \"\"\"\n",
    "    x,y = np.meshgrid(range(n), range(n))\n",
    "    D = np.sqrt(2.0/n) * np.cos(np.pi * (2*x+1) * y / (2*n))\n",
    "    D[0] /= np.sqrt(2)\n",
    "    return D\n",
    "\n",
    "def extract(x):\n",
    "    \"\"\"\n",
    "    Extract MFCC coefficients of the sound x in numpy array format.\n",
    "    \"\"\"\n",
    "    FS = 16000                               # Sampling rate\n",
    "    FRAME_LEN = int(0.025 * FS)              # Frame length\n",
    "    FRAME_SHIFT = int(0.01 * FS)             # Frame shift\n",
    "    FFT_SIZE = 2048                          # How many points for FFT\n",
    "    WINDOW = np.hamming(FRAME_LEN)           # Window function\n",
    "    PRE_EMPH = 0.97                          # Pre-emphasis factor\n",
    "\n",
    "    BANDS = 40                               # Number of Mel filters\n",
    "    COEFS = 13                               # Number of Mel cepstra coefficients to keep\n",
    "    POWER_SPECTRUM_FLOOR = 1e-100            # Flooring for the power to avoid log(0)\n",
    "    M, CF = melfb(BANDS, FFT_SIZE, FS)       # The Mel filterbank matrix and the center frequencies of each band\n",
    "    D = dctmtx(BANDS)[0:COEFS]               # The DCT matrix. Change the index to [0:COEFS] if you want to keep the 0-th coefficient\n",
    "    invD = np.linalg.inv(dctmtx(BANDS))[:, 0:COEFS]    # The inverse DCT matrix. Change the index to [0:COEFS] if you want to keep the 0-th \n",
    "    \n",
    "    if x.ndim > 1:\n",
    "        print(\"INFO: Input signal has more than 1 channel; the channels will be averaged.\")\n",
    "        x = mean(x, axis=1)\n",
    "    frames = int((len(x) - FRAME_LEN) / FRAME_SHIFT + 1)\n",
    "    feature = []\n",
    "    for f in range(frames):\n",
    "        # Windowing\n",
    "        frame = x[f * FRAME_SHIFT : f * FRAME_SHIFT + FRAME_LEN] * WINDOW\n",
    "        # Pre-emphasis\n",
    "        frame[1:] -= frame[:-1] * PRE_EMPH\n",
    "        # Power spectrum\n",
    "        X = np.abs(np.fft.fft(frame, FFT_SIZE)[:FFT_SIZE/2+1]) ** 2\n",
    "        X[X < POWER_SPECTRUM_FLOOR] = POWER_SPECTRUM_FLOOR  # Avoid zero\n",
    "        # Mel filtering, logarithm, DCT\n",
    "        X = np.dot(D, np.log(np.dot(M,X)))\n",
    "        feature.append(X)\n",
    "    feature = np.row_stack(feature)\n",
    "    return feature"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TIMIT_DIR = '../timit'\n",
    "#filename = 'timit.h5'\n",
    "# no transformation\n",
    "#DTYPE = np.float32\n",
    "#extractor = lambda x: x.astype(DTYPE).reshape(-1, 1)\n",
    "#frame_size=1\n",
    "#frame_shift=1\n",
    "#derivatives=0\n",
    "#preprocessing_description = \"Only minimal preprocessing (normalizing to zero mean and unit standard deviation).\"\n",
    "\n",
    "# mfcc + 1st and 2nd deriv\n",
    "filename = 'timit_mfcc.h5'\n",
    "extractor = extract\n",
    "frame_size=400\n",
    "frame_shift=160\n",
    "derivatives=2\n",
    "DTYPE = np.float64\n",
    "preprocessing_description = \"\"\"Extracted 12 MFCCs coefficients + energy with window size of 25ms and 10ms step. \n",
    "Used hamming window and a pre-emphasis coefficient of 0.97.\n",
    "Also included 1st and 2nd time-derivative of the signal for a total of 39 feature dimensions.\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phones\n",
    "Timit uses 61 phones (and erroneously calles them phonemes). But some tasks work with a reduced set of only 39 phones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl',\n",
    "          'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi',\n",
    "          'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy',\n",
    "          'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau',\n",
    "          'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v',\n",
    "          'w', 'y', 'z', 'zh']\n",
    "silence_label = phones.index('h#')\n",
    "\n",
    "reduce_phones = {p: p for p in phones if p != 'q'}  # discard q\n",
    "reduce_phones.update({\n",
    "    'ae': 'aa',\n",
    "    'ax': 'ah', 'ax-h': 'ah',\n",
    "    'axr': 'er',\n",
    "    'hv': 'hh',\n",
    "    'ix': 'ih',\n",
    "    'el': 'l',\n",
    "    'em': 'm',\n",
    "    'en': 'n', 'nx': 'n',\n",
    "    'eng': 'ng',\n",
    "    'zh': 'sh',\n",
    "    'pcl': 'h#', 'tcl': 'h#', 'kcl': 'h#', 'bcl': 'h#', 'dcl': 'h#', 'gcl': 'h#', 'pau': 'h#', 'epi': 'h#',\n",
    "    'ux': 'uw'\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TIMIT Sample Class\n",
    "We first write a small class that captures and extracts all important information about a single TIMIT sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class TimitSample(object):\n",
    "    @classmethod\n",
    "    def create(cls, directory, name):\n",
    "        f = os.path.join(directory, name.split('.')[0])\n",
    "        f = f.split('/')[-4:]\n",
    "        sample = cls(f[0], f[1], f[2][0], f[2][1:], f[3])\n",
    "        return sample\n",
    "\n",
    "    def __init__(self, usage, dialect, sex, speaker_id, sentence_id,\n",
    "                 start=None, stop=None):\n",
    "        self.usage = usage\n",
    "        self.dialect = dialect\n",
    "        self.sex = sex\n",
    "        self.speaker_id = speaker_id\n",
    "        self.sentence_id = sentence_id\n",
    "        self.start = start\n",
    "        self.stop = stop\n",
    "\n",
    "    def _get_path(self, fileending):\n",
    "        if not fileending.startswith('.'):\n",
    "            fileending = '.' + fileending\n",
    "        return os.path.join(TIMIT_DIR, self.usage, self.dialect, self.sex +\n",
    "                            self.speaker_id, self.sentence_id + fileending)\n",
    "\n",
    "    def get_sentence(self):\n",
    "        filename = self._get_path('txt')\n",
    "        with file(filename, 'r') as f:\n",
    "            content = f.read()\n",
    "            start, stop, sentence = content.split(' ', 2)\n",
    "            return int(start), int(stop), sentence.strip()\n",
    "\n",
    "    def get_words(self):\n",
    "        filename = self._get_path('wrd')\n",
    "        with file(filename, 'r') as f:\n",
    "            content = f.readlines()\n",
    "            wordlist = [c.strip().split(' ', 2) for c in content]\n",
    "            return [(int(start), int(stop), word)\n",
    "                    for start, stop, word in wordlist\n",
    "                    if (self.start is None or int(start) >= self.start) and\n",
    "                       (self.stop is None or int(stop) <= self.stop)]\n",
    "\n",
    "    def get_phones(self):\n",
    "        filename = self._get_path('phn')\n",
    "        with file(filename, 'r') as f:\n",
    "            content = f.readlines()\n",
    "            phone_list = [c.strip().split(' ', 2) for c in content]\n",
    "            return [(int(start), int(stop), phone, phones.index(phone))\n",
    "                    for start, stop, phone in phone_list\n",
    "                    if (self.start is None or int(start) >= self.start) and\n",
    "                       (self.stop is None or int(stop) <= self.stop)]\n",
    "\n",
    "    def get_audio_data(self):\n",
    "        filename = os.path.join(TIMIT_DIR, self.usage, self.dialect,\n",
    "                                self.sex + self.speaker_id,\n",
    "                                self.sentence_id + '.wav')\n",
    "        f = al.Sndfile(filename, 'r')\n",
    "        data = f.read_frames(f.nframes, dtype=np.float64)\n",
    "        return data[self.start:self.stop]\n",
    "\n",
    "    def get_labels(self, frame_size=1, frame_shift=1):\n",
    "        phones = self.get_phones()\n",
    "        begin = self.start if self.start else 0\n",
    "        p_extended = [silence_label] * (phones[0][0] - begin)\n",
    "        for p in phones:\n",
    "            p_extended += [p[3]] * (int(p[1]) - int(p[0]))\n",
    "        end = phones[-1][1]\n",
    "        windows = zip(range(0, end - begin - frame_size + 1, frame_shift),\n",
    "                      range(frame_size, end - begin + 1, frame_shift))\n",
    "        labels = [np.bincount(p_extended[w[0]:w[1]]).argmax() for w in windows]\n",
    "        return np.array(labels, dtype=np.byte)\n",
    "\n",
    "    def get_features(self, extractor, frame_size=1, frame_shift=1, derivatives=0):\n",
    "        d = self.get_audio_data()\n",
    "        features = extractor(d)\n",
    "\n",
    "        feature_derivs = [features]\n",
    "        for i in range(derivatives):\n",
    "            feature_derivs.append(np.gradient(feature_derivs[-1])[0])\n",
    "\n",
    "        all_features = np.hstack(feature_derivs)\n",
    "        labels = self.get_labels(frame_size, frame_shift)\n",
    "        return all_features, labels\n",
    "\n",
    "    def __unicode__(self):\n",
    "        return '<TimitSample ' + '/'.join([self.usage, self.dialect,\n",
    "                                           self.sex + self.speaker_id,\n",
    "                                           self.sentence_id]) + '>'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_all_samples():\n",
    "    samples = []\n",
    "    for dirname, dirnames, filenames in os.walk(TIMIT_DIR):\n",
    "        samples += [TimitSample.create(dirname, n)\n",
    "                    for n in filenames if n.endswith('.wav')]\n",
    "    return samples\n",
    "\n",
    "def filter_samples(samples, usage=None, dialect=None, sex=None, speaker_id=None,\n",
    "                   sentence_id=None):\n",
    "    def match(s):\n",
    "        return (usage is None or s.usage == usage) and \\\n",
    "               (dialect is None or s.dialect == dialect) and \\\n",
    "               (sex is None or s.sex == sex) and \\\n",
    "               (speaker_id is None or s.speaker_id == speaker_id) and \\\n",
    "                (sentence_id is None or s.sentence_id == sentence_id)\n",
    "    return [s for s in samples if match(s)]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_features_and_labels_for(samples):\n",
    "    ds_list = [s.get_features(extractor, derivatives=derivatives, frame_size=frame_size, frame_shift=frame_shift)\n",
    "               for s in samples]\n",
    "        \n",
    "    maxlen = max(f.shape[0] for f, l in ds_list)\n",
    "    padded_features = []\n",
    "    padded_labels = []\n",
    "    masks = []\n",
    "    for f, l in ds_list:\n",
    "        pad_length_f = maxlen - f.shape[0]\n",
    "        pad_length_l = maxlen - l.shape[0]\n",
    "\n",
    "        mask = np.ones_like(l)\n",
    "        padded_features.append(np.vstack((f, np.zeros((pad_length_f, f.shape[1]), dtype=DTYPE))))\n",
    "        padded_labels.append(np.hstack((l, np.ones(pad_length_l, dtype=DTYPE) * silence_label)))\n",
    "        masks.append(np.hstack((mask, np.zeros(pad_length_l, dtype=DTYPE))))\n",
    "\n",
    "    features = np.dstack(padded_features).swapaxes(1, 2)\n",
    "    labels = np.vstack(padded_labels).T.reshape(maxlen, -1, 1)\n",
    "    masks = np.vstack(masks).T.reshape(maxlen, -1, 1)\n",
    "    return features, labels, masks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_padded_labels(samples, reduced=False):\n",
    "    if not reduced:\n",
    "        L = [zip(*l.get_phones())[3] for l in samples]\n",
    "    else:\n",
    "        L_tmp = [zip(*l.get_phones())[2] for l in samples]\n",
    "        L = [[reduced_phones.index(reduce_phones[p]) for p in l if p != 'q'] for l in L_tmp]\n",
    "        \n",
    "    L_len = max([len(l) for l in L])\n",
    "    L_padded = -np.ones([L_len, len(L), 1], dtype=np.byte)\n",
    "    for i, l in enumerate(L):\n",
    "        L_padded[:len(l), i, 0] = l\n",
    "    return L_padded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_means(input_data, mask=None):\n",
    "    \"\"\"\n",
    "    Get the mean values for every feature in the batch of sequences X by\n",
    "    considering only masked-in entries.\n",
    "    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
    "    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
    "    @return: mean value for each feature. shape = (features, )\n",
    "    \"\"\"\n",
    "    if mask is not None:\n",
    "        return input_data[:, :, :].reshape(-1, input_data.shape[2])[\n",
    "            mask.flatten() == 1].mean(0)\n",
    "    else:\n",
    "        return input_data[:, :, :].mean((0, 1))\n",
    "\n",
    "\n",
    "def get_stds(input_data, mask=None, channel_mask=None):\n",
    "    \"\"\"\n",
    "    Get the standard deviation for every feature in the batch of sequences X by\n",
    "    considering only masked-in entries.\n",
    "    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
    "    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
    "    @return: standard deviation of each feature. shape = (features, )\n",
    "    \"\"\"\n",
    "    if mask is not None:\n",
    "        return input_data[:, :, :].reshape(-1, input_data.shape[2])[\n",
    "            mask.flatten() == 1].std(0)\n",
    "    else:\n",
    "        return input_data[:, :, :].std((0, 1))\n",
    "\n",
    "\n",
    "def subtract_means(input_data, means, mask=None):\n",
    "    \"\"\"\n",
    "    Subtract the means from the masked-in entries of a batch of sequences X.\n",
    "    This operation is performed in-place, i.e. the input_data will be modified.\n",
    "\n",
    "    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
    "    @param means: The means to subtract. shape = (features, )\n",
    "    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
    "    @param channel_mask: Optional mask for the channels. shape = (feature,)\n",
    "    \"\"\"\n",
    "    if mask is not None:\n",
    "        j = 0\n",
    "        for i in range(input_data.shape[2]):\n",
    "            input_data[:, :, i][mask[:, :, 0] == 1] -= means[j]\n",
    "            j += 1\n",
    "    else:\n",
    "        input_data[:, :, :] -= means\n",
    "\n",
    "\n",
    "def divide_by_stds(input_data, stds, mask=None):\n",
    "    \"\"\"\n",
    "    Divide masked-in entries of input_data by the stds.\n",
    "\n",
    "    @param input_data: Batch of sequences. shape = (time, sample, feature)\n",
    "    @param stds: The standard deviations for every feature. shape = (features, )\n",
    "    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n",
    "    \"\"\"\n",
    "    if mask is not None:\n",
    "        j = 0\n",
    "        for i in range(input_data.shape[2]):\n",
    "            input_data[:, :, i][mask[:, :, 0] == 1] /= stds[j]\n",
    "            j += 1\n",
    "    else:\n",
    "        input_data[:, :, :] /= stds\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Playin around"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_samples = read_all_samples()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "samples = all_samples[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/greff/venv/py2/lib/python2.7/site-packages/ipykernel/__main__.py:74: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n"
     ]
    }
   ],
   "source": [
    "X, T, M = get_features_and_labels_for(samples)\n",
    "L = get_padded_labels(samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "means = get_means(X, M)\n",
    "subtract_means(X, means, M)\n",
    "stds = get_stds(X, M)\n",
    "divide_by_stds(X, stds, M)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ -1.36270510e-01,  -2.64865545e-02,  -1.88458491e-03,\n",
       "        -5.35061361e-03,  -4.14322723e-04,  -1.62373580e-03,\n",
       "        -4.65313835e-04,   1.66968832e-04,   1.34837504e-03,\n",
       "         2.84097840e-04,   3.70963168e-04,   9.10096640e-05,\n",
       "        -4.71527965e-05,  -1.85187170e-03,  -1.06796464e-03,\n",
       "         1.74962193e-03,   7.97651630e-04,   6.28126471e-04,\n",
       "        -1.48645415e-03,  -6.65244577e-04,  -3.86702196e-04,\n",
       "        -1.08519194e-04,   1.22560158e-03,   6.08361830e-04,\n",
       "         8.05459867e-04,  -6.19838093e-04,  -3.79369618e-04,\n",
       "        -7.56204580e-04,   6.21650029e-04,   9.32087564e-05,\n",
       "         7.47136741e-04,  -3.67511093e-04,  -2.92121960e-04,\n",
       "        -1.71338698e-04,  -1.18740905e-04,   3.18499112e-04,\n",
       "         2.22929043e-04,   5.36611170e-04,  -8.22041047e-04])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.mean((0, 1)) # checking the mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9473095372851984"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.reshape(-1, 1)[M.flatten() == 1].std() # checking the variance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the full(original) TIMIT dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'w') as f:\n",
    "    orig = f.create_group('original')\n",
    "    orig.attrs['description'] = \"\"\"\n",
    "    TIMIT\n",
    "    =====\n",
    "    \n",
    "    This is the original TIMIT dataset.\n",
    "    \n",
    "    Preprocessing\n",
    "    -------------\n",
    "    {}\n",
    "    \n",
    "    Content\n",
    "    -------\n",
    "    default: All audio data padded to be of equal length\n",
    "    targets: Phone index for each frame (same shape as default)\n",
    "    masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n",
    "    labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n",
    "    names: list of filenames in the original dataset for each sample\n",
    "    \"\"\".format(preprocessing_description)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4620"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_samples = filter_samples(all_samples, usage='train')\n",
    "len(train_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X, T, M = get_features_and_labels_for(train_samples)\n",
    "L = get_padded_labels(train_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "means = get_means(X, M)\n",
    "subtract_means(X, means, M)\n",
    "\n",
    "stds = get_stds(X, M)\n",
    "divide_by_stds(X, stds, M)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "shuffling = range(len(train_samples))\n",
    "shuffle(shuffling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = X[:, shuffling, :]\n",
    "T = T[:, shuffling, :]\n",
    "M = M[:, shuffling, :]\n",
    "L = L[:, shuffling, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "timit_train_names = [train_samples[i]._get_path(\".txt\").encode() for i in shuffling]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'a') as f:\n",
    "    orig = f['original']\n",
    "    train = orig.create_group('training')\n",
    "    train.create_dataset('default', data=X, compression='gzip', chunks=(X.shape[0], 1, X.shape[2]))\n",
    "    train.create_dataset('targets', data=T, compression='gzip', chunks=(T.shape[0], 1, T.shape[2]))\n",
    "    train.create_dataset('masks', data=M, compression='gzip', chunks=(M.shape[0], 1, M.shape[2]))\n",
    "    train.create_dataset('labels', data=L)\n",
    "    train.create_dataset('names', data=np.array(timit_train_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del X, T, M, L"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1680"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_samples = filter_samples(all_samples, usage='test')\n",
    "len(test_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_test, T_test, M_test = get_features_and_labels_for(test_samples)\n",
    "L_test = get_padded_labels(test_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subtract_means(X_test, means, M_test)\n",
    "divide_by_stds(X_test, stds, M_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "timit_test_names = [x._get_path(\".txt\").encode() for x in test_samples]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'a') as f:\n",
    "    orig = f['original']\n",
    "    test = orig.create_group('test')\n",
    "    test.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n",
    "    test.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n",
    "    test.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n",
    "    test.create_dataset('labels', data=L_test)\n",
    "    test.create_dataset('names', data=np.array(timit_test_names))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The reduced Timit Dataset\n",
    "\n",
    "see Phd Thesis of Andrew K. Halberstadt 1998"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'a') as f: \n",
    "#set 'a' to 'w' if you didn't prepare the full (original) TIMIT \n",
    "    orig = f.create_group('reduced')\n",
    "    orig.attrs['description'] = \"\"\"\n",
    "    TIMIT Reduced\n",
    "    =============\n",
    "    \n",
    "    This is the reduced TIMIT dataset. \n",
    "    It only uses a core test set of 24 speakers, discards all the SA samples from training and has a fixed validation set.\n",
    "    (For details see the PhD Thesis of Andrew K. Halberstadt 1998.)\n",
    "    \n",
    "    Preprocessing\n",
    "    -------------\n",
    "    {}\n",
    "    \n",
    "    Content\n",
    "    -------\n",
    "    default: All audio data padded to be of equal length\n",
    "    targets: Phone index for each frame (same shape as default)\n",
    "    masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n",
    "    labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n",
    "    labels_reduced: Integer array with all the phones mapped to the reduced phone set. (like labels)\n",
    "    names: list of filenames in the original dataset for each sample\n",
    "    \"\"\".format(preprocessing_description)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "core_test_speakers={\"mdab0\", \"mwbt0\", \"felc0\", \"mtas1\", \"mwew0\", \"fpas0\",\n",
    "                    \"mjmp0\", \"mlnt0\", \"fpkt0\", \"mlll0\", \"mtls0\", \"fjlm0\",\n",
    "                    \"mbpm0\", \"mklt0\", \"fnlp0\", \"mcmj0\", \"mjdh0\", \"fmgd0\",\n",
    "                    \"mgrt0\", \"mnjm0\", \"fdhc0\", \"mjln0\", \"mpam0\", \"fmld0\"}\n",
    "len(core_test_speakers)  # should be 24"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "192"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "core_test_set= [x for x in all_samples if x.sex + x.speaker_id in core_test_speakers and not x.sentence_id.startswith('sa')]\n",
    "len(core_test_set)  # should be 192"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "462"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len({x.speaker_id for x in all_samples if x.usage=='train'})  # should be 462"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3696"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_set = [x for x in all_samples if x.usage=='train' and not x.sentence_id.startswith('sa')]\n",
    "len(train_set)  # should be 3696"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "168"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len({x.speaker_id for x in all_samples if x.usage=='test'}) # should be 168"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val_set_speakers={'faks0', 'mmdb1', 'mbdg0', 'fedw0', 'mtdt0', 'fsem0', 'mdvc0', 'mrjm4', 'mjsw0', 'mteb0',\n",
    "                  'fdac1', 'fjem0', 'mgwt0', 'mmdm2', 'mpdf0', 'fcmh0', 'mbwm0', 'mcsh0', 'fadg0', 'mgjf0',\n",
    "                  'mglb0', 'mrtk0', 'mthc0', 'mwjg0', 'fnmr0', 'mbns0', 'mmjr0', 'mdls0', 'mers0', 'fmah0',\n",
    "                  'fdrw0', 'fcal1', 'mmwh0', 'fjsj0', 'mreb0', 'fgjd0', 'fjmg0', 'mjfc0', 'mrjr0', 'fmml0',\n",
    "                  'mjar0', 'fkms0', 'fdms0', 'mtaa0', 'frew0', 'mdlf0', 'mrcs0', 'majc0', 'mroa0', 'mrws1'}\n",
    "len(val_set_speakers) # should be 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "400"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Validation set: every sample whose sex + speaker_id key is one of the\n",
    "# validation speakers, skipping sentences whose id starts with 'sa'.\n",
    "val_set = [\n",
    "    x for x in all_samples\n",
    "    if x.sex + x.speaker_id in val_set_speakers\n",
    "    and not x.sentence_id.startswith('sa')\n",
    "]\n",
    "len(val_set)  # should be 400"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up every phone except 'q' in reduce_phones and keep the distinct\n",
    "# results in sorted order.\n",
    "reduced_phones = sorted(set(\n",
    "    reduce_phones[p] for p in phones if p != 'q'\n",
    "))\n",
    "len(reduced_phones) # should be 39"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, T_train, M_train = get_features_and_labels_for(train_set)\n",
    "L_train = get_padded_labels(train_set)\n",
    "L_train_reduced = get_padded_labels(train_set, reduced=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "means = get_means(X_train, M_train)\n",
    "subtract_means(X_train, means, M_train)\n",
    "\n",
    "stds = get_stds(X_train, M_train)\n",
    "divide_by_stds(X_train, stds, M_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Random permutation of the training-sample indices.\n",
    "# list() materializes the range so that shuffle() can permute it in place on\n",
    "# Python 3 as well (a bare range object does not support item assignment).\n",
    "# NOTE(review): no seed is set in this cell, so the order differs between runs.\n",
    "shuffling = list(range(len(train_set)))\n",
    "shuffle(shuffling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = X_train[:, shuffling, :]\n",
    "T_train = T_train[:, shuffling, :]\n",
    "M_train = M_train[:, shuffling, :]\n",
    "L_train = L_train[:, shuffling, :]\n",
    "L_train_reduced = L_train_reduced[:, shuffling, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rtimit_train_names = [train_set[i]._get_path(\".txt\").encode() for i in shuffling]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'a') as f:\n",
    "    orig = f['reduced']\n",
    "    train = orig.create_group('training')\n",
    "    train.create_dataset('default', data=X_train, compression='gzip', chunks=(X_train.shape[0], 1, X_train.shape[2]))\n",
    "    train.create_dataset('targets', data=T_train, compression='gzip', chunks=(T_train.shape[0], 1, T_train.shape[2]))\n",
    "    train.create_dataset('masks', data=M_train, compression='gzip', chunks=(M_train.shape[0], 1, M_train.shape[2]))\n",
    "    train.create_dataset('labels', data=L_train)\n",
    "    train.create_dataset('labels_reduced', data=L_train_reduced)\n",
    "    train.create_dataset('names', data=np.array(rtimit_train_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del X_train, T_train, M_train, L_train, L_train_reduced"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_val, T_val, M_val = get_features_and_labels_for(val_set)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subtract_means(X_val, means, M_val)\n",
    "divide_by_stds(X_val, stds, M_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "L_val = get_padded_labels(val_set)\n",
    "L_val_reduced = get_padded_labels(val_set, reduced=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Encoded _get_path('.txt') result for every validation sample, in val_set order.\n",
    "rtimit_val_names = [x._get_path(\".txt\").encode() for x in val_set]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with h5py.File(filename, 'a') as f:\n",
    "    orig = f['reduced']\n",
    "    validation = orig.create_group('validation')\n",
    "    validation.create_dataset('default', data=X_val, compression='gzip', chunks=(X_val.shape[0], 1, X_val.shape[2]))\n",
    "    validation.create_dataset('targets', data=T_val, compression='gzip', chunks=(T_val.shape[0], 1, T_val.shape[2]))\n",
    "    validation.create_dataset('masks', data=M_val, compression='gzip', chunks=(M_val.shape[0], 1, M_val.shape[2]))\n",
    "    validation.create_dataset('labels', data=L_val)\n",
    "    validation.create_dataset('labels_reduced', data=L_val_reduced)\n",
    "    validation.create_dataset('names', data=np.array(rtimit_val_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del X_val, T_val, M_val, L_val, L_val_reduced"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the core test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_test, T_test, M_test = get_features_and_labels_for(core_test_set)\n",
    "L_test = get_padded_labels(core_test_set)\n",
    "L_test_reduced = get_padded_labels(core_test_set, reduced=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subtract_means(X_test, means, M_test)\n",
    "divide_by_stds(X_test, stds, M_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Encoded _get_path('.txt') result for every core test sample, in core_test_set order.\n",
    "rtimit_test_names = [x._get_path(\".txt\").encode() for x in core_test_set]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the core test arrays into a new 'test' group under 'reduced'.\n",
    "# The gzip-compressed datasets use one chunk per index along axis 1.\n",
    "# The group handle is named `test` (not `validation`) so it matches its\n",
    "# contents and does not rebind the validation group's handle.\n",
    "with h5py.File(filename, 'a') as f:\n",
    "    orig = f['reduced']\n",
    "    test = orig.create_group('test')\n",
    "    test.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n",
    "    test.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n",
    "    test.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n",
    "    test.create_dataset('labels', data=L_test)\n",
    "    test.create_dataset('labels_reduced', data=L_test_reduced)\n",
    "    test.create_dataset('names', data=np.array(rtimit_test_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del X_test, T_test, M_test, L_test, L_test_reduced"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}