@dkohlsdorf, created September 25, 2019
UnsupervisedRecursiveDolphinParsing.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "UnsupervisedRecursiveDolphinParsing.ipynb",
"provenance": [],
"collapsed_sections": [
"DktDaGaRZf6i",
"3C_8fMhVemnh"
],
"machine_shape": "hm",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/dkohlsdorf/629f06584c314ed6d81e60ecf3cb5ed0/unsupervisedrecursivedolphinparsing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wUIYM9D3Y38B",
"colab_type": "text"
},
"source": [
"\n",
"# Unsupervised Neural Audio Parsing\n",
"\n",
"The basic idea is to use a recursive neural network to parse audio data in the form of spectrograms.\n"
]
},
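{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before the model itself, here is a minimal, self-contained sketch of the core control flow on a toy scalar sequence: repeatedly merge the adjacent pair of segments with the lowest reconstruction error until a single root remains. The scoring below (distance between neighbors) and the helper `toy_greedy_merge` are stand-ins for illustration only; the actual score comes from the neural grammar defined later."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import numpy as np\n",
"\n",
"def toy_greedy_merge(values):\n",
"    '''Greedy bottom-up merging on scalars; returns the merges as bracketings.'''\n",
"    segments = [(v, str(i)) for i, v in enumerate(values)]\n",
"    order = []\n",
"    while len(segments) > 1:\n",
"        # score each adjacent pair; here simply how far apart the neighbors are\n",
"        errors = [abs(l - r) for (l, _), (r, _) in zip(segments, segments[1:])]\n",
"        i = int(np.argmin(errors))\n",
"        (l, ln), (r, rn) = segments[i], segments[i + 1]\n",
"        merged = ((l + r) / 2, '({} {})'.format(ln, rn))\n",
"        segments[i:i + 2] = [merged]\n",
"        order.append(merged[1])\n",
"    return order\n",
"\n",
"# the two similar pairs merge first, the dissimilar boundary merges last\n",
"print(toy_greedy_merge([1.0, 1.1, 5.0, 5.2, 1.0]))"
],
"execution_count": 0,
"outputs": []
},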
{
"cell_type": "markdown",
"metadata": {
"id": "zp4nesrYZRIk",
"colab_type": "text"
},
"source": [
"## Installing and importing\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "HLBMJ1P1hxyZ",
"colab_type": "code",
"colab": {}
},
"source": [
"!apt install -y graphviz\n",
"\n",
"!pip install --upgrade --quiet scikit-sound\n",
"!pip install --upgrade --quiet pygame\n",
"!pip install --upgrade torchviz\n",
"!pip install --upgrade graphviz"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "T9DCTfhKh9lF",
"colab_type": "code",
"colab": {}
},
"source": [
"import sys\n",
"import os\n",
"import random\n",
"import warnings\n",
"\n",
"import matplotlib.image as mpimg\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"from operator import attrgetter\n",
"\n",
"from collections import namedtuple\n",
"from google.colab import drive\n",
"from numpy.fft import fft, ifft\n",
"from sksound.sounds import Sound\n",
"from torch.autograd import Variable\n",
"from torch.nn.utils import clip_grad_norm\n",
"from graphviz import Digraph\n",
"\n",
"from sklearn.cluster import KMeans"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "wt4cEztriUAa",
"colab_type": "code",
"colab": {}
},
"source": [
"drive.mount('/content/drive')\n",
"\n",
"WDP_PATH = '/content/drive/My Drive/Wild Dolphin Project acoustics'\n",
"SPOTTED_2012 = '{}/2012 wav extraction/SP mpeg'.format(WDP_PATH)\n",
"\n",
"DKOHL = '/content/drive/My Drive/dolphin/'\n",
"DKOHL_PATH_2012 = '/content/drive/My Drive/dolphin/experiments2015/data_dolphin/experiments/2012'\n",
"DKOHL_PATH_CTX = '/content/drive/My Drive/dolphin/experiments2015/data_dolphin/experiments/Context'\n",
"DKOHL_PATH_WSTL = '/content/drive/My Drive/dolphin/experiments2015/data_dolphin/catalogue/whistle_snippets'\n",
"DKOHL_PATH_BRST = '/content/drive/My Drive/dolphin/experiments2015/data_dolphin/catalogue/burst_snippet'\n",
"\n",
"DKOHL_PATH_NOISE = '/content/drive/My Drive/dolphin/experiments2015/data_dolphin/catalogue/noise_snippets'"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "EifFnlBnZXbh",
"colab_type": "text"
},
"source": [
"## Unsupervised Recursive Neural Net For Audio\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "VUBGLtdgiLrY",
"colab_type": "code",
"colab": {}
},
"source": [
"CONV = 128 # Number of convolutional filters\n",
"K = 412 # Number of input dimensions\n",
"WIN = 10 # Window size in time of filter\n",
"H = 68 # Dimension of hidden layer in grammar\n",
"LAB = 2 * WIN # Label size of prediction window\n",
"D = 10\n",
"BEAM_WIDTH = 2\n",
"\n",
"class FeatureExtractor(nn.Module):\n",
" '''\n",
" The feature extractor is a simple convolutional neural network.\n",
" Given an input sequence (1,1,T,D) and K filters, we will tranform \n",
" the input into another sequence (T, K).\n",
" '''\n",
" def __init__(self):\n",
" super(FeatureExtractor, self).__init__() \n",
" self.conv1 = nn.Conv2d(in_channels=1, out_channels=CONV, kernel_size=(WIN, D), stride=1, padding=0)\n",
"\n",
" def forward(self, x): \n",
" x = F.relu(self.conv1(x))\n",
" x = F.max_pool2d(x, (WIN, K)) \n",
" x = x.view(CONV, -1).transpose(0, 1)\n",
" return x\n",
"\n",
" \n",
"class NeuralGrammar(nn.Module):\n",
" '''\n",
" Given a tree node we project the left child and right child into\n",
" a feature space cl and cr. The concatenated [cl, cr] is projected\n",
" into the representation of that node. Furthermore, we predict\n",
" the label from the hidden representation.\n",
" '''\n",
"\n",
" def __init__(self):\n",
" super(NeuralGrammar, self).__init__()\n",
" self.left = nn.Linear(CONV, H, bias=True)\n",
" self.right = nn.Linear(CONV, H, bias=True)\n",
" self.parent = nn.Linear(2 * H, CONV, bias=True)\n",
" self.projection = nn.Linear(CONV, LAB * K, bias=True) \n",
"\n",
" def forward(self, x): \n",
" l = x.left_child.representation\n",
" r = x.right_child.representation\n",
" y = x.label\n",
" x = torch.cat([torch.relu(self.left(l)), torch.relu(self.right(r))], 0)\n",
" p = torch.tanh(self.parent(x))\n",
" x = self.projection(p)\n",
" score = torch.sum(torch.pow(x - y, 2)) / y.shape[0] # + noise\n",
" return (p, score) \n",
"\n",
"\n",
"class MergingState(namedtuple('MergingState', 'nodes, rmse')):\n",
"\n",
" def done(self):\n",
" return len(self.nodes) == 1\n",
"\n",
" def root(self):\n",
" assert self.done\n",
" return nodes[0]\n",
" \n",
"\n",
"class TreeNode:\n",
" '''\n",
" A tree node in a neural grammar. \n",
" Represented as a binary tree. Each node is also represented by\n",
" an embedding vector. The label is what we want to predict.\n",
" The start and stop is the span of this subtree \n",
" '''\n",
"\n",
" def __init__(self, start, stop, representation=None):\n",
" self.representation = representation\n",
" self.label = None\n",
" self.left_child = None\n",
" self.right_child = None\n",
" self.start = start\n",
" self.stop = stop\n",
" \n",
" @property\n",
" def span_length(self):\n",
" return self.stop - self.start \n",
"\n",
" def is_leaf(self):\n",
" return self.left_child is None and self.right_child is None \n",
" \n",
" def max_depth(self):\n",
" if self.is_leaf():\n",
" return 1\n",
" else:\n",
" return max(\n",
" self.left_child.max_depth() + 1, \n",
" self.right_child.max_depth() + 1\n",
" ) \n",
"\n",
" def span(self, spectrogram):\n",
" return spectrogram[self.start:self.stop]\n",
"\n",
" def mk_label(self, spectrogram):\n",
" '''\n",
" Build a label from different spans\n",
" if the span is larger than the desired label: max-pool down to desired shape excess is cut off \n",
" if the span is smaller than the desired label: pad with zeros \n",
" '''\n",
" T = self.span_length\n",
" n_times = max(int(T / LAB), 1) \n",
" large = torch.max_pool2d(spectrogram[:, :,self.start:self.stop, :], (n_times, 1))\n",
" y = large[0,0,0:LAB,:]\n",
" return y.reshape(-1)\n",
"\n",
" @classmethod\n",
" def bfs_tree(cls, data, feature_extractor, grammar):\n",
" '''\n",
" Breadth first search merging using a feature extractor and a grammar\n",
"\n",
" returns all possible results sorted ascending by reconstruction error \n",
"\n",
" data: a multi dimensional sequence (1,1,T,D)\n",
" feature extractor: a neural net converting the sequence (1,1,T,D) to another sequence (t,d)\n",
" grammar: a neural network that operates on trees\n",
" '''\n",
" base = feature_extractor(data) \n",
" T, D = base.shape\n",
" leafs = [cls(i * WIN, (i + 1) * WIN, base[i, :]) for i in range(T)] \n",
" queue = [MergingState(leafs, 0.0)]\n",
" results = []\n",
" while len(queue) > 0:\n",
" mstate = queue[0]\n",
" queue = queue[1:]\n",
" if mstate.done():\n",
" results.append(mstate)\n",
" else:\n",
" state, rmse = mstate\n",
" candidates = []\n",
" for i in range(1, len(state)):\n",
" next_state = state.copy() \n",
" hypothesis = cls(next_state[i - 1].start, next_state[i].stop) \n",
" hypothesis.label = hypothesis.mk_label(data)\n",
" hypothesis.left_child = next_state[i - 1]\n",
" hypothesis.right_child = next_state[i]\n",
" representation, score = grammar(hypothesis)\n",
" hypothesis.representation = representation\n",
" next_state[i - 1:i + 1] = [hypothesis]\n",
" candidates.append(MergingState(next_state, rmse + score)) \n",
" queue.extend(sorted(candidates, key=attrgetter('rmse'))[0:BEAM_WIDTH])\n",
" return sorted(results, key=attrgetter('rmse'))\n",
"\n",
" @classmethod\n",
" def greedy_tree(cls, data, feature_extractor, grammar):\n",
" '''\n",
" Greedily merges bottom up using a feature extractor and grammar\n",
"\n",
" data: a multi dimensional sequence (1,1,T,D)\n",
" feature extractor: a neural net converting the sequence (1,1,T,D) to another sequence (t,d)\n",
" grammar: a neural network that operates on trees\n",
" '''\n",
" base = feature_extractor(data) \n",
" T, D = base.shape\n",
" leafs = [cls(i * WIN, (i + 1) * WIN, base[i, :]) for i in range(T)] \n",
" rmse = torch.zeros(1)\n",
" while len(leafs) >= 2:\n",
" min_score = float('inf')\n",
" min_hypothesis = None \n",
" min_idx = None \n",
" for i in range(1, len(leafs)):\n",
" # difference of reconstruction of the center of the span of the subtree in the spectrogram is the score\n",
" # inspired by continuous bag of words\n",
" hypothesis = cls(leafs[i - 1].start, leafs[i].stop)\n",
" hypothesis.label = hypothesis.mk_label(data)\n",
" hypothesis.left_child = leafs[i - 1]\n",
" hypothesis.right_child = leafs[i]\n",
" representation, score = grammar(hypothesis)\n",
" hypothesis.representation = representation\n",
" if score < min_score:\n",
" min_score = score\n",
" min_hypothesis = hypothesis\n",
" min_idx = i \n",
" rmse += min_score\n",
" leafs[min_idx - 1:min_idx + 1] = [min_hypothesis] \n",
" rmse = torch.sqrt(rmse / LAB)\n",
" return leafs[0], rmse\n",
" \n",
"\n",
"def parse(input, fe, grammar):\n",
" return TreeNode.greedy_tree(input, fe, grammar) \n",
"\n",
" \n",
"def learn_parser(inputs, epochs=25): \n",
" '''\n",
" Learns a parsing neural network along with the features\n",
" \n",
" inputs: a set of sequences [(1, 1, T, D)]\n",
" epochs: number of training epochs\n",
" '''\n",
" fe = FeatureExtractor()\n",
" grammar = NeuralGrammar() \n",
" optimizer = torch.optim.Adam(list(fe.parameters()) + list(grammar.parameters()))\n",
" trees = []\n",
" rmse_total = None\n",
" for epoch in range(0, epochs):\n",
" rmse_total = torch.ones(1) \n",
" trees = []\n",
" # E - Step: Building structure \n",
" for i in inputs:\n",
" root, score = TreeNode.bfs_tree(i, fe, grammar)[0].root() # TreeNode.greedy_tree(i, fe, grammar) \n",
" trees.append(root)\n",
" rmse_total += score\n",
" # M - Step: Propagate error (collects all gradients first in that version)\n",
" rmse_total = rmse_total / len(inputs)\n",
" rmse_total.backward() \n",
" clip_grad_norm(fe.parameters(), 1.0, norm_type=2.)\n",
" clip_grad_norm(grammar.parameters(), 1.0, norm_type=2.)\n",
" loss=optimizer.step()\n",
" print(\"EPOCH: {}, RMSE: {}\".format(epoch, float(rmse_total)))\n",
" return trees, fe, grammar, rmse_total\n",
"\n",
"\n",
"def fwd_spectrogram(audio, win=1024, step=512):\n",
" '''\n",
" Compute the spectrogram of audio data\n",
"\n",
" audio: one channel audio\n",
" win: window size for dft sliding window\n",
" step: step size for dft sliding windo\n",
" '''\n",
" spectrogram = []\n",
" hanning = np.hanning(win)\n",
" for i in range(win, len(audio), step):\n",
" dft = fft(audio[i - win: i])\n",
" spectrogram.append(dft)\n",
" return np.array(spectrogram)\n",
"\n",
"\n",
"def spectrogram_from_file(filename, max_len=1000):\n",
" '''\n",
" Read audio and convert to z-normalized spectrogram \n",
"\n",
" filename: path to the file\n",
" max_len: clip files\n",
" '''\n",
" sound = Sound(filename) \n",
" data = sound.data\n",
" if len(data.shape) > 1:\n",
" data = data[:, 0] \n",
" spec = np.abs(fwd_spectrogram(data))[:,512:1024 - 100]\n",
" mu = np.mean(spec) \n",
" std = np.std(spec) + 1.0 \n",
" spec = (spec - mu) / std\n",
" spec = spec[0:max_len, :]\n",
" t, d = spec.shape\n",
" spec = spec.reshape(1, 1, t, d)\n",
" spec = torch.tensor(spec.astype(np.float32))\n",
" print(\"Read File: {} with dim {}\".format(filename, spec.shape)) \n",
" return spec "
],
"execution_count": 0,
"outputs": []
},
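{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on synthetic data (a sketch; `_x` is random noise, not dolphin audio): the feature extractor should map a (1, 1, T, K) spectrogram to a sequence of roughly T / WIN window embeddings of size CONV, and greedy parsing should return a single root spanning the input."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"_x = torch.randn(1, 1, 100, K)  # synthetic spectrogram: 100 frames, K frequency bins\n",
"_fe = FeatureExtractor()\n",
"_grammar = NeuralGrammar()\n",
"print(_fe(_x).shape)            # expected: (9, 128) for T = 100, WIN = 10, CONV = 128\n",
"_root, _rmse = TreeNode.greedy_tree(_x, _fe, _grammar)\n",
"print(_root.start, _root.stop, _root.max_depth(), float(_rmse))"
],
"execution_count": 0,
"outputs": []
},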
{
"cell_type": "code",
"metadata": {
"id": "xZTYuTA7iPPl",
"colab_type": "code",
"colab": {}
},
"source": [
"files = []\n",
"for filename in os.listdir(DKOHL_PATH_2012):\n",
" if (filename.endswith('m4a') or filename.endswith('.wav')) and not filename.startswith('._'):\n",
" files.append(\"{}/{}\".format(DKOHL_PATH_2012, filename))\n",
"\n",
"for r, d, f in os.walk(DKOHL_PATH_CTX):\n",
" for file in f:\n",
" if (file.endswith('m4a') or file.endswith('.wav')) and not file.startswith('._'):\n",
" files.append(os.path.join(r, file))\n",
"print(files)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lgJ4xpRbaHOQ",
"colab_type": "code",
"colab": {}
},
"source": [
"trees, fe, grammar, scores = learn_parser([torch.tensor(spectrogram_from_file(file, 10000)) for file in files[0:1]], epochs=100)"
],
"execution_count": 0,
"outputs": []
},
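{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once trained, `parse` applies the learned feature extractor and grammar to any recording. A small sketch, assuming `files` holds at least one readable recording:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"root, rmse = parse(spectrogram_from_file(files[0], 1000), fe, grammar)\n",
"print(\"depth: {}, span: ({}, {}), rmse: {}\".format(root.max_depth(), root.start, root.stop, float(rmse)))"
],
"execution_count": 0,
"outputs": []
},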
{
"cell_type": "markdown",
"metadata": {
"id": "DktDaGaRZf6i",
"colab_type": "text"
},
"source": [
"## if we want to see the whole autograd graph"
]
},
{
"cell_type": "code",
"metadata": {
"id": "i53KD76leYzM",
"colab_type": "code",
"colab": {}
},
"source": [
"from torchviz import make_dot\n",
"x = make_dot(scores)\n",
"x.save('{}/graph.dot'.format(DKOHL))"
],
"execution_count": 0,
"outputs": []
},
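{
"cell_type": "markdown",
"metadata": {},
"source": [
"`make_dot` returns a `graphviz.Digraph`, so instead of saving the raw dot source the graph can also be rendered to an image directly:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"x.format = 'png'\n",
"x.render('{}/graph'.format(DKOHL))  # writes the dot source and graph.png"
],
"execution_count": 0,
"outputs": []
},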
{
"cell_type": "markdown",
"metadata": {
"id": "SMFvpCzyRjqo",
"colab_type": "text"
},
"source": [
"## Load and Save"
]
},
{
"cell_type": "code",
"metadata": {
"id": "HNY8BJRAi5k1",
"colab_type": "code",
"colab": {}
},
"source": [
"torch.save(fe.state_dict(), '{}/convolve.pt'.format(DKOHL))\n",
"torch.save(grammar.state_dict(), '{}/grammar.pt'.format(DKOHL))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CtP12Txt_Pv7",
"colab_type": "code",
"colab": {}
},
"source": [
"fe = FeatureExtractor()\n",
"fe.load_state_dict(torch.load('{}/convolve.pt'.format(DKOHL)))\n",
"grammar = NeuralGrammar()\n",
"grammar.load_state_dict(torch.load('{}/grammar.pt'.format(DKOHL)))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "pLKOIsm1ZkoY",
"colab_type": "text"
},
"source": [
"## Visualize Filters"
]
},
{
"cell_type": "code",
"metadata": {
"id": "l39xTUZCpMTx",
"colab_type": "code",
"colab": {}
},
"source": [
"params = list(fe.parameters())\n",
"k = 1\n",
"plt.figure(figsize=(80, 50))\n",
"for i in range(CONV):\n",
" plt.subplot(1, CONV, k)\n",
" cur_axes = plt.gca()\n",
" cur_axes.axes.get_xaxis().set_ticklabels([])\n",
" cur_axes.axes.get_yaxis().set_ticklabels([])\n",
" filter = np.array(params[0][i,0,:,:].data)\n",
" plt.imshow(filter.T)\n",
" k += 1\n",
"plt.savefig('{}/features.png'.format(DKOHL))\n",
"plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "oVR1plFTrBfT",
"colab_type": "code",
"colab": {}
},
"source": [
"plt.figure(figsize=(15, 100))\n",
"k = 1\n",
"for i in range(10):\n",
" x = spectrogram_from_file(files[i], 10000)\n",
" plt.subplot(20, 1, k)\n",
" plt.imshow(1.0 - x.detach().numpy()[0,0,:,:].T, cmap='gray', aspect=\"auto\")\n",
" cur_axes = plt.gca()\n",
" cur_axes.axes.get_xaxis().set_ticklabels([])\n",
" cur_axes.axes.get_yaxis().set_ticklabels([])\n",
" k += 1\n",
" plt.subplot(20, 1, k)\n",
" base = fe(x)\n",
" plt.imshow(base.detach().numpy().T,aspect=\"auto\")\n",
" cur_axes = plt.gca()\n",
" cur_axes.axes.get_xaxis().set_ticklabels([])\n",
" cur_axes.axes.get_yaxis().set_ticklabels([])\n",
" k += 1\n",
"plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3C_8fMhVemnh",
"colab_type": "text"
},
"source": [
"## Visualize Trees"
]
},
{
"cell_type": "code",
"metadata": {
"id": "WTamfK-rdtyE",
"colab_type": "code",
"colab": {}
},
"source": [
"def tree_walker(node, spectrogram, path, json = \"\", name = 'r'):\n",
" if not os.path.exists(path):\n",
" os.mkdir(path)\n",
" start = node.start\n",
" stop = node.stop\n",
" spec = spectrogram[start:stop, :]\n",
" plt.imshow(1.0 - spec.T, cmap='gray') \n",
" cur_axes = plt.gca()\n",
" cur_axes.axes.get_xaxis().set_ticklabels([])\n",
" cur_axes.axes.get_yaxis().set_ticklabels([])\n",
" plt.savefig('{}/{}.png'.format(path, name))\n",
" plt.close()\n",
"\n",
" image = '{}.png'.format(name)\n",
" if not node.is_leaf():\n",
" left = tree_walker(node.left_child, spectrogram, path, json, name + '0')\n",
" right = tree_walker(node.right_child, spectrogram, path, json, name + '1')\n",
" return '{{image: \"{}\", children: [{}, {}]}}'.format(image, left, right)\n",
" else:\n",
" return '{{image: \"{}\"}}'.format(image)\n",
"\n",
"def write_tree_json(i, path):\n",
" json = tree_walker(trees[i], spectrogram_from_file(files[i], 10000)[0,0,:,:].detach().numpy(), path)\n",
" return \"\"\" \n",
" var chart_config = {{ \n",
" chart: {{container: \"#dolphin\"}},\n",
" nodeStructure: {} \n",
" }};\n",
" \"\"\".format(json)\n",
"\n",
"print(write_tree_json(1, '{}/22092019/'.format(DKOHL)))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zqSlzTYGQWs2",
"colab_type": "text"
},
"source": [
"Paste into file\n",
"\n",
"```\n",
"<!DOCTYPE html>\n",
"<html>\n",
" <head>\n",
" <meta charset=\"utf-8\">\n",
" <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\">\n",
" <meta name=\"viewport\" content=\"width=device-width\">\n",
" <title> Basic example </title>\n",
" <link rel=\"stylesheet\" href=\"../../Treant.css\">\n",
" <link rel=\"stylesheet\" href=\"basic-example.css\">\n",
"\n",
"</head>\n",
"<body>\n",
" <div id=\"dolphin\"></div>\n",
" <script src=\"../../vendor/raphael.js\"></script>\n",
" <script src=\"../../Treant.js\"></script>\n",
" <script src=\"dolphin.js\"></script>\n",
" <script>\n",
" new Treant( chart_config );\n",
" </script>\n",
"</body>\n",
"</html>\n",
"```"
]
},
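{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of copy-pasting the printed config, it can be written straight to the `dolphin.js` the page above expects. A sketch; `out_dir` must be the same directory the node snapshots were rendered into:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"out_dir = '{}/22092019/'.format(DKOHL)\n",
"with open(os.path.join(out_dir, 'dolphin.js'), 'w') as f:\n",
"    f.write(write_tree_json(0, out_dir))"
],
"execution_count": 0,
"outputs": []
},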
{
"cell_type": "markdown",
"metadata": {
"id": "Dm7JOkMde55i",
"colab_type": "text"
},
"source": [
"## Visualize Span Clusters"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RhZQjWE_e-7U",
"colab_type": "code",
"colab": {}
},
"source": [
"def clustering_candidates(tree, min_size, max_size):\n",
" representations = []\n",
" if tree.span_length >= min_size and tree.span_length <= max_size:\n",
" representations = [tree]\n",
" if not tree.is_leaf():\n",
" representations += clustering_candidates(tree.left_child, min_size, max_size)\n",
" representations += clustering_candidates(tree.right_child, min_size, max_size) \n",
" return representations\n",
"\n",
"representations = []\n",
"for file in files: \n",
" representations += [(file, r) for r in clustering_candidates(parse(spectrogram_from_file(file, 10000), fe, grammar)[0], 50, 250)] \n",
" print(len(representations))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "O-DSDp6Eu4Wt",
"colab_type": "code",
"colab": {}
},
"source": [
"k = 256\n",
"x = np.array([r.representation.detach().numpy() for _, r in representations])\n",
"km = KMeans(n_clusters=k)\n",
"y = km.fit_predict(x)\n",
"print(\"Train: {}, Test: {}\".format(x.shape, y.shape))\n",
"print(y)"
],
"execution_count": 0,
"outputs": []
},
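{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at how evenly the spans distribute over the clusters, assuming `y` and `k` from the cell above:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"sizes = np.bincount(y, minlength=k)\n",
"print(\"non-empty clusters: {} / {}\".format(int(np.sum(sizes > 0)), k))\n",
"plt.hist(sizes, bins=20)\n",
"plt.xlabel('cluster size')\n",
"plt.ylabel('count')\n",
"plt.show()"
],
"execution_count": 0,
"outputs": []
},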
{
"cell_type": "code",
"metadata": {
"id": "WvfvZzqawsjF",
"colab_type": "code",
"colab": {}
},
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"for c in range(0, 256):\n",
" n = 1\n",
" for i in range(x.shape[0]):\n",
" if c == y[i]:\n",
" n += 1\n",
" if n >= 3 and n < 20:\n",
" k = 1\n",
" for i in range(x.shape[0]):\n",
" if c == y[i]:\n",
" f = plt.subplot(1, n, k)\n",
" (f, r) = representations[i]\n",
" start = r.start\n",
" stop = r.stop\n",
" spectrogram = spectrogram_from_file(f, 10000)[0,0,start:stop,:].detach().numpy()\n",
" plt.imshow(1.0 - spectrogram.T, cmap='gray', aspect=\"auto\") \n",
" cur_axes = plt.gca()\n",
" cur_axes.axes.get_xaxis().set_ticklabels([])\n",
" cur_axes.axes.get_yaxis().set_ticklabels([])\n",
" k += 1\n",
" plt.show()\n"
],
"execution_count": 0,
"outputs": []
}
]
}