\documentclass[]{article}
\usepackage[margin=1cm]{geometry}
\usepackage{tikz,pgfplots,pgf}
\usepackage{neuralnetwork}
\usepackage{sidecap}
\usepackage{amsmath}
\usetikzlibrary{matrix,shapes,arrows,positioning}
\usetikzlibrary{chains,decorations.pathreplacing}
\usetikzlibrary{backgrounds}
\usetikzlibrary{arrows.meta,tikzmark}
\usetikzlibrary{fadings}
\def\layersep{2.5cm}
\usepackage{xcolor}
\definecolor{fc}{HTML}{1E90FF}
\definecolor{h}{HTML}{228B22}
\definecolor{bias}{HTML}{87CEFA}
\definecolor{noise}{HTML}{8B008B}
\definecolor{conv}{HTML}{FFA500}
\definecolor{pool}{HTML}{B22222}
\definecolor{up}{HTML}{B22222}
\definecolor{view}{HTML}{FFFFFF}
\definecolor{bn}{HTML}{FFD700}
\tikzset{fc/.style={black,draw=black,fill=fc,rectangle,minimum height=1cm}}
\tikzset{h/.style={black,draw=black,fill=h,rectangle,minimum height=1cm}}
\tikzset{bias/.style={black,draw=black,fill=bias,rectangle,minimum height=1cm}}
\tikzset{noise/.style={black,draw=black,fill=noise,rectangle,minimum height=1cm}}
\tikzset{conv/.style={black,draw=black,fill=conv,rectangle,minimum height=1cm}}
\tikzset{pool/.style={black,draw=black,fill=pool,rectangle,minimum height=1cm}}
\tikzset{up/.style={black,draw=black,fill=up,rectangle,minimum height=1cm}}
\tikzset{view/.style={black,draw=black,fill=view,rectangle,minimum height=1cm}}
\tikzset{bn/.style={black,draw=black,fill=bn,rectangle,minimum height=1cm}}
\DeclareMathOperator{\ReLU}{ReLU}
\newcommand{\KL}{\ensuremath{\mathrm{KL}}}
\newcommand{\Ber}{\ensuremath{\mathrm{Ber}}}
\usepackage{xspace}
\newcommand*{\eg}{\emph{e.g.}\@\xspace}
\newcommand*{\Eg}{\emph{E.g.}\@\xspace}
\newcommand*{\ie}{\emph{i.e.}\@\xspace}
\newcommand*{\etc}{\emph{etc.}\@\xspace}
\newcommand*{\etal}{\emph{et al.}\@\xspace}
\newcommand*{\cf}{\emph{cf.}\@\xspace}
\newcommand*{\vs}{\emph{vs.}\@\xspace}
\newcommand\numRowsK{3}
\newcommand\numColsK{3}
\newcommand{\K}[2]{% #1: row, #2: col
\edef\Kcol##1##2##3{###2}%
\edef\Krow##1##2##3{\noexpand\Kcol###1}%
\Krow
{1 0 1}
{0 1 0}
{1 0 1}%
}
\begin{document}
\begin{figure}[htp]
\centering
\begin{tikzpicture}[
plain/.style={
draw=none,
fill=none,
},
dot/.style={draw,shape=circle,minimum size=3pt,inner sep=0,fill=black
},
net/.style={
matrix of nodes,
nodes={
draw,
circle,
inner sep=8.5pt
},
nodes in empty cells,
column sep=0.6cm,
row sep=-11pt
},
>=latex
]
\matrix[net] (mat)
{
|[plain]| \parbox{1cm}{\centering Input\\layer}
& |[plain]| \parbox{1cm}{\centering Hidden\\layer}
& |[plain]| \parbox{1cm}{\centering Output\\layer} \\
& |[plain]| \\
|[plain]| & & |[plain]| \\
& |[plain]| & \\
|[plain]| & |[dot]| \\
& |[plain]| & |[dot]| \\
|[plain]| & |[dot]| & |[plain]| \\
|[dot]| & |[plain]| & |[dot]| \\
|[dot]| & |[dot]| & |[plain]| \\
|[dot]| & |[plain]| & \\
|[plain]| & & |[plain]| \\
& |[plain]| \\
};
\foreach \ai/\mi in {2/I1,4/I2,6/I3,12/In}
\draw[<-] (mat-\ai-1) -- node[above] {\mi} +(-1cm,0);
\foreach \ai in {2,4,6,12}
{\foreach \aii/\mii in {3/H1,11/Hn}
\draw[->] (mat-\ai-1) -- (mat-\aii-2) node[yshift=0.6cm] {\mii};
}
\foreach \ai in {3,11}
{ \draw[->] (mat-\ai-2) -- (mat-4-3);
\draw[->] (mat-4-3) -- node[above] {O1} +(1cm,0);}
\foreach \ai in {3,11}
{ \draw[->] (mat-\ai-2) -- (mat-10-3);
\draw[->] (mat-10-3) -- node[above] {On} +(1cm,0);}
\end{tikzpicture}
\caption{ANN diagram for speed sign recognition.}
\label{fig_m_3}
\end{figure}
\begin{center}
\begin{neuralnetwork}[height=4]
\newcommand{\x}[2]{$x_#2$}
\newcommand{\y}[2]{$\hat{y}_#2$}
\newcommand{\hfirst}[2]{\small $h^{(1)}_#2$}
\newcommand{\hsecond}[2]{\small $h^{(2)}_#2$}
\inputlayer[count=3, bias=true, title=Input\\layer, text=\x]
\hiddenlayer[count=4, bias=false, title=Hidden\\layer 1, text=\hfirst] \linklayers
\hiddenlayer[count=3, bias=false, title=Hidden\\layer 2, text=\hsecond] \linklayers
\outputlayer[count=2, title=Output\\layer, text=\y] \linklayers
\end{neuralnetwork}
\end{center}
\begin{center}
\begin{tikzpicture}[
% define styles
init/.style={
draw,
circle,
inner sep=2pt,
font=\Huge,
join = by -latex
},
squa/.style={
font=\Large,
join = by -latex
}
]
% Top chain x1 to w1
\begin{scope}[start chain=1]
\node[on chain=1] at (0,1.5cm) (x1) {$x_1$};
\node[on chain=1,join=by o-latex] (w1) {$w_1$};
\end{scope}
% Middle chain x2 to output
\begin{scope}[start chain=2]
\node[on chain=2] (x2) {$x_2$};
\node[on chain=2,join=by o-latex] {$w_2$};
\node[on chain=2,init] (sigma) {$\displaystyle\Sigma$};
\node[on chain=2,squa,label=above:{\parbox{2cm}{\centering Activation\\ function}}] {$f_{act}$};
\node[on chain=2,squa,label=above:Output,join=by -latex] {$y_{out}$};
\end{scope}
% Bottom chain x3 to w3
\begin{scope}[start chain=3]
\node[on chain=3] at (0,-1.5cm)
(x3) {$x_3$};
\node[on chain=3,label=below:Weights,join=by o-latex]
(w3) {$w_3$};
\end{scope}
% Bias
\node[label=above:\parbox{2cm}{\centering Bias \\ $b$}] at (sigma|-w1) (b) {};
% Arrows joining w1, w3 and b to sigma
\draw[-latex] (w1) -- (sigma);
\draw[-latex] (w3) -- (sigma);
\draw[o-latex] (b) -- (sigma);
% left hand side brace
\draw[decorate,decoration={brace,mirror}] (x1.north west) -- node[left=10pt] {Inputs} (x3.south west);
\end{tikzpicture}
\end{center}
\begin{center}
\begin{tikzpicture}[shorten >=1pt,->,draw=black!50, node distance=\layersep]
\tikzstyle{every pin edge}=[<-,shorten <=1pt]
\tikzstyle{neuron}=[circle,fill=black!25,minimum size=17pt,inner sep=0pt]
\tikzstyle{input neuron}=[neuron, fill=green!50];
\tikzstyle{output neuron}=[neuron, fill=red!50];
\tikzstyle{hidden neuron}=[neuron, fill=blue!50];
\tikzstyle{annot} = [text width=4em, text centered]
% Draw the input layer nodes
\foreach \name / \y in {1,...,4}
% This is the same as writing \foreach \name / \y in {1/1,2/2,3/3,4/4}
\node[input neuron, pin=left:Input \#\y] (I-\name) at (0,-\y) {};
% Draw the hidden layer nodes
\foreach \name / \y in {1,...,5}
\path[yshift=0.5cm]
node[hidden neuron] (H-\name) at (\layersep,-\y cm) {};
% Draw the output layer node
\node[output neuron,pin={[pin edge={->}]right:Output}, right of=H-3] (O) {};
% Connect every node in the input layer with every node in the
% hidden layer.
\foreach \source in {1,...,4}
\foreach \dest in {1,...,5}
\path (I-\source) edge (H-\dest);
% Connect every node in the hidden layer with the output layer
\foreach \source in {1,...,5}
\path (H-\source) edge (O);
% Annotate the layers
\node[annot,above of=H-1, node distance=1cm] (hl) {Hidden layer};
\node[annot,left of=hl] {Input layer};
\node[annot,right of=hl] {Output layer};
\end{tikzpicture}
\end{center}
% End of code
% \usepackage{sidecap}
\begin{SCfigure}[2\sidecaptionrelwidth][t]
\centering
\begin{tikzpicture}[shorten >=1pt,->]
\tikzstyle{unit}=[draw,shape=circle,minimum size=1.15cm]
\node[unit](p) at (2,1){$y$};
\node(dots) at (-0.25,1){\vdots};
\draw (0,2.5) node[xshift=-10]{$w_0$} -- (p);
\draw (0,1.75) node[xshift=-10]{$x_1$} --(p);
\draw (0,0) node[xshift=-10]{$x_D$} -- (p);
\draw (p) -- (3,1) node[xshift=30]{$y := f(z)$};
\end{tikzpicture}
\caption[Single processing unit and its components.]{Single processing unit and its components. The activation function is denoted by $f$ and applied to the actual input $z$ of the unit to form its output $y = f(z)$. $x_1, \ldots, x_D$ represent input from other units within the network; $w_0$ is called the bias and represents an external input to the unit. All inputs are mapped onto the actual input $z$ using the propagation rule.}
\label{fig:processing-unit}
\end{SCfigure}
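% Worked form of the propagation rule from Figure \ref{fig:processing-unit}; this assumes
% the common weighted-sum rule, which the caption itself leaves unspecified:
\begin{equation*}
	z = w_0 + \sum_{i=1}^{D} w_i x_i, \qquad y = f(z).
\end{equation*}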
\begin{SCfigure}[12]
\centering
\begin{tikzpicture}[shorten >=1pt]
\tikzstyle{unit}=[draw,shape=circle,minimum size=1.15cm]
\node[unit](x0) at (0,3.5){$x_0$};
\node[unit](x1) at (0,2){$x_1$};
\node(dots) at (0,1){\vdots};
\node[unit](xd) at (0,0){$x_D$};
\node[unit](y1) at (4,2.5){$y_1$};
\node(dots) at (4,1.5){\vdots};
\node[unit](yc) at (4,0.5){$y_C$};
\draw[->] (x0) -- (y1);
\draw[->] (x0) -- (yc);
\draw[->] (x1) -- (y1);
\draw[->] (x1) -- (yc);
\draw[->] (xd) -- (y1);
\draw[->] (xd) -- (yc);
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (-0.5,4) -- (0.75,4) node [black,midway,yshift=+0.6cm]{input layer};
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (3.5,3) -- (4.75,3) node [black,midway,yshift=+0.6cm]{output layer};
\end{tikzpicture}
\caption[Network graph of a perceptron with $D$ input units and $C$ output units.]{The perceptron consists of $D$ input units and $C$ output units. All units are labeled according to their output: $y_i = f(z_i)$ in the case of output units; $x_i$ in the case of input units. The input values $x_i$ are propagated to each output unit using the weighted sum propagation rule. The additional input value $x_0 := 1$ is used to include the biases as weights.}
\label{fig:perceptron}
\end{SCfigure}
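% Written out for Figure \ref{fig:perceptron}, using the caption's convention $x_0 := 1$ so
% that the biases appear as the weights $w_{i0}$ (the index convention is an assumption):
\begin{equation*}
	y_i = f(z_i) = f\left(\sum_{j=0}^{D} w_{ij}\, x_j\right), \qquad i = 1, \ldots, C.
\end{equation*}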
% \usetikzlibrary{fadings} ---> fading
\begin{figure}[t]
\centering
\begin{tikzpicture}[shorten >=1pt]
\tikzstyle{unit}=[draw,shape=circle,minimum size=1.15cm]
%\tikzstyle{hidden}=[draw,shape=circle,fill=black!25,minimum size=1.15cm]
\tikzstyle{hidden}=[draw,shape=circle,minimum size=1.15cm]
\node[unit](x0) at (0,3.5){$x_0$};
\node[unit](x1) at (0,2){$x_1$};
\node at (0,1){\vdots};
\node[unit](xd) at (0,0){$x_D$};
\node[hidden](h10) at (3,4){$y_0^{(1)}$};
\node[hidden](h11) at (3,2.5){$y_1^{(1)}$};
\node at (3,1.5){\vdots};
\node[hidden](h1m) at (3,-0.5){$y_{m^{(1)}}^{(1)}$};
\node(h22) at (5,0){};
\node(h21) at (5,2){};
\node(h20) at (5,4){};
\node(d3) at (6,0){$\ldots$};
\node(d2) at (6,2){$\ldots$};
\node(d1) at (6,4){$\ldots$};
\node(hL12) at (7,0){};
\node(hL11) at (7,2){};
\node(hL10) at (7,4){};
\node[hidden](hL0) at (9,4){$y_0^{(L)}$};
\node[hidden](hL1) at (9,2.5){$y_1^{(L)}$};
\node at (9,1.5){\vdots};
\node[hidden](hLm) at (9,-0.5){$y_{m^{(L)}}^{(L)}$};
\node[unit](y1) at (12,3.5){$y_1^{(L+1)}$};
\node[unit](y2) at (12,2){$y_2^{(L+1)}$};
\node at (12,1){\vdots};
\node[unit](yc) at (12,0){$y_C^{(L+1)}$};
\draw[->] (x0) -- (h11);
\draw[->] (x0) -- (h1m);
\draw[->] (x1) -- (h11);
\draw[->] (x1) -- (h1m);
\draw[->] (xd) -- (h11);
\draw[->] (xd) -- (h1m);
\draw[->] (hL0) -- (y1);
\draw[->] (hL0) -- (yc);
\draw[->] (hL0) -- (y2);
\draw[->] (hL1) -- (y1);
\draw[->] (hL1) -- (yc);
\draw[->] (hL1) -- (y2);
\draw[->] (hLm) -- (y1);
\draw[->] (hLm) -- (y2);
\draw[->] (hLm) -- (yc);
\draw[->,path fading=east] (h10) -- (h21);
\draw[->,path fading=east] (h10) -- (h22);
\draw[->,path fading=east] (h11) -- (h21);
\draw[->,path fading=east] (h11) -- (h22);
\draw[->,path fading=east] (h1m) -- (h21);
\draw[->,path fading=east] (h1m) -- (h22);
\draw[->,path fading=west] (hL10) -- (hL1);
\draw[->,path fading=west] (hL11) -- (hL1);
\draw[->,path fading=west] (hL12) -- (hL1);
\draw[->,path fading=west] (hL10) -- (hLm);
\draw[->,path fading=west] (hL11) -- (hLm);
\draw[->,path fading=west] (hL12) -- (hLm);
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (-0.5,4) -- (0.75,4) node [black,midway,yshift=+0.6cm]{input layer};
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (2.5,4.5) -- (3.75,4.5) node [black,midway,yshift=+0.6cm]{$1^{\text{st}}$ hidden layer};
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (8.5,4.5) -- (9.75,4.5) node [black,midway,yshift=+0.6cm]{$L^{\text{th}}$ hidden layer};
\draw [decorate,decoration={brace,amplitude=10pt},xshift=-4pt,yshift=0pt] (11.5,4) -- (12.75,4) node [black,midway,yshift=+0.6cm]{output layer};
\end{tikzpicture}
\caption[Network graph for an $(L+1)$-layer perceptron.]{Network graph of an $(L+1)$-layer perceptron with $D$ input units and $C$ output units. The $l^{\text{th}}$ hidden layer contains $m^{(l)}$ hidden units.}
\label{fig:multilayer-perceptron}
\end{figure}
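% A sketch of the layer-wise propagation in Figure \ref{fig:multilayer-perceptron}, assuming
% weighted-sum propagation with $y^{(0)} := x$ and bias units $y_0^{(l)} := 1$:
\begin{equation*}
	y_i^{(l)} = f\left(\sum_{j=0}^{m^{(l-1)}} w_{ij}^{(l)}\, y_j^{(l-1)}\right),
	\qquad l = 1, \ldots, L+1.
\end{equation*}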
\begin{SCfigure}[2\sidecaptionrelwidth][t!]
\centering
\begin{tikzpicture}[shorten >=1pt]
\tikzstyle{unit}=[draw,shape=circle,minimum size =1.4cm]
\node[unit](i) at (0,1){$y_i^{(l)}$};
\node[unit](k1) at (3,2){$y_1^{(l+1)}$};
\node at (3, 1){$\vdots$};
\node[unit](km) at (3,-0.25){$y_{m^{(l+1)}}^{(l+1)}$};
\node at (1.25,2.25){$\delta_1^{(l+1)}$};
\node at (1.25,-0.5){$\delta_{m^{(l+1)}}^{(l+1)}$};
\draw[->] (i) -- (k1);
\draw[->] (i) -- (km);
\draw[->,red,line width=0.05cm] (2,-0.25) -- (0.75,0.3);
\draw[->,red,line width=0.05cm] (2,2) -- (0.75,1.6);
\end{tikzpicture}
\caption[Backpropagation of errors through the network.]{Once evaluated for all output units, the errors $\delta_i^{(L+1)}$ can be propagated backwards.}
\label{fig:error-backpropagation}
\end{SCfigure}
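% One common form of the backward pass sketched in Figure \ref{fig:error-backpropagation},
% assuming weighted-sum propagation with actual inputs $z_i^{(l)}$ and activation $f$:
\begin{equation*}
	\delta_i^{(l)} = f'\!\left(z_i^{(l)}\right) \sum_{k=1}^{m^{(l+1)}} w_{ki}^{(l+1)}\, \delta_k^{(l+1)}.
\end{equation*}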
\begin{figure}[t!]
\centering
\begin{tikzpicture}
\node at (0.5,-1){\begin{tabular}{c}input image\\layer $l = 0$\end{tabular}};
\draw (0,0) -- (1,0) -- (1,1) -- (0,1) -- (0,0);
\node at (3,3.5){\begin{tabular}{c}convolutional layer\\with non-linearities\\layer $l = 1$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (2.75,1.25) -- (3.75,1.25) -- (3.75,2.25) -- (2.75,2.25) -- (2.75,1.25);
\draw[fill=black,opacity=0.2,draw=black] (2.5,1) -- (3.5,1) -- (3.5,2) -- (2.5,2) -- (2.5,1);
\draw[fill=black,opacity=0.2,draw=black] (2.25,0.75) -- (3.25,0.75) -- (3.25,1.75) -- (2.25,1.75) -- (2.25,0.75);
\draw[fill=black,opacity=0.2,draw=black] (2,0.5) -- (3,0.5) -- (3,1.5) -- (2,1.5) -- (2,0.5);
\draw[fill=black,opacity=0.2,draw=black] (1.75,0.25) -- (2.75,0.25) -- (2.75,1.25) -- (1.75,1.25) -- (1.75,0.25);
\draw[fill=black,opacity=0.2,draw=black] (1.5,0) -- (2.5,0) -- (2.5,1) -- (1.5,1) -- (1.5,0);
\node at (4.5,-1){\begin{tabular}{c}subsampling layer\\layer $l = 3$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (5,1.25) -- (5.75,1.25) -- (5.75,2) -- (5,2) -- (5,1.25);
\draw[fill=black,opacity=0.2,draw=black] (4.75,1) -- (5.5,1) -- (5.5,1.75) -- (4.75,1.75) -- (4.75,1);
\draw[fill=black,opacity=0.2,draw=black] (4.5,0.75) -- (5.25,0.75) -- (5.25,1.5) -- (4.5,1.5) -- (4.5,0.75);
\draw[fill=black,opacity=0.2,draw=black] (4.25,0.5) -- (5,0.5) -- (5,1.25) -- (4.25,1.25) -- (4.25,0.5);
\draw[fill=black,opacity=0.2,draw=black] (4,0.25) -- (4.75,0.25) -- (4.75,1) -- (4,1) -- (4,0.25);
\draw[fill=black,opacity=0.2,draw=black] (3.75,0) -- (4.5,0) -- (4.5,0.75) -- (3.75,0.75) -- (3.75,0);
\node at (7,3.5){\begin{tabular}{c}convolutional layer\\with non-linearities\\layer $l = 4$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (7.5,1.75) -- (8.25,1.75) -- (8.25,2.5) -- (7.5,2.5) -- (7.5,1.75);
\draw[fill=black,opacity=0.2,draw=black] (7.25,1.5) -- (8,1.5) -- (8,2.25) -- (7.25,2.25) -- (7.25,1.5);
\draw[fill=black,opacity=0.2,draw=black] (7,1.25) -- (7.75,1.25) -- (7.75,2) -- (7,2) -- (7,1.25);
\draw[fill=black,opacity=0.2,draw=black] (6.75,1) -- (7.5,1) -- (7.5,1.75) -- (6.75,1.75) -- (6.75,1);
\draw[fill=black,opacity=0.2,draw=black] (6.5,0.75) -- (7.25,0.75) -- (7.25,1.5) -- (6.5,1.5) -- (6.5,0.75);
\draw[fill=black,opacity=0.2,draw=black] (6.25,0.5) -- (7,0.5) -- (7,1.25) -- (6.25,1.25) -- (6.25,0.5);
\draw[fill=black,opacity=0.2,draw=black] (6,0.25) -- (6.75,0.25) -- (6.75,1) -- (6,1) -- (6,0.25);
\draw[fill=black,opacity=0.2,draw=black] (5.75,0) -- (6.5,0) -- (6.5,0.75) -- (5.75,0.75) -- (5.75,0);
\node at (9.5,-1){\begin{tabular}{c}subsampling layer\\layer $l = 6$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (10,1.75) -- (10.5,1.75) -- (10.5,2.25) -- (10,2.25) -- (10,1.75);
\draw[fill=black,opacity=0.2,draw=black] (9.75,1.5) -- (10.25,1.5) -- (10.25,2) -- (9.75,2) -- (9.75,1.5);
\draw[fill=black,opacity=0.2,draw=black] (9.5,1.25) -- (10,1.25) -- (10,1.75) -- (9.5,1.75) -- (9.5,1.25);
\draw[fill=black,opacity=0.2,draw=black] (9.25,1) -- (9.75,1) -- (9.75,1.5) -- (9.25,1.5) -- (9.25,1);
\draw[fill=black,opacity=0.2,draw=black] (9,0.75) -- (9.5,0.75) -- (9.5,1.25) -- (9,1.25) -- (9,0.75);
\draw[fill=black,opacity=0.2,draw=black] (8.75,0.5) -- (9.25,0.5) -- (9.25,1) -- (8.75,1) -- (8.75,0.5);
\draw[fill=black,opacity=0.2,draw=black] (8.5,0.25) -- (9,0.25) -- (9,0.75) -- (8.5,0.75) -- (8.5,0.25);
\draw[fill=black,opacity=0.2,draw=black] (8.25,0) -- (8.75,0) -- (8.75,0.5) -- (8.25,0.5) -- (8.25,0);
\node at (12,3.5){\begin{tabular}{c}fully connected layer\\layer $l = 7$\end{tabular}};
\draw[fill=black,draw=black,opacity=0.5] (10.5,0) -- (11,0) -- (12.5,1.75) -- (12,1.75) -- (10.5,0);
\node at (13,-1){\begin{tabular}{c}fully connected layer\\output layer $l = 8$\end{tabular}};
\draw[fill=black,draw=black,opacity=0.5] (12.5,0.5) -- (13,0.5) -- (13.65,1.25) -- (13.15,1.25) -- (12.5,0.5);
\end{tikzpicture}
\caption[Architecture of a traditional convolutional neural network.]{The architecture of the original convolutional neural network, as introduced by LeCun et al. (1989), alternates between convolutional layers including hyperbolic tangent non-linearities and subsampling layers. In this illustration, the convolutional layers already include non-linearities and, thus, a convolutional layer actually represents two layers. The feature maps of the final subsampling layer are then fed into the actual classifier consisting of an arbitrary number of fully connected layers. The output layer usually uses softmax activation functions.}
\label{fig:traditional-convolutional-network}
\end{figure}
\begin{SCfigure}[2\sidecaptionrelwidth][t]
\centering
\begin{tikzpicture}
\node at (1.5,4){\begin{tabular}{c}input image\\or input feature map\end{tabular}};
\draw (0,0) -- (3,0) -- (3,3) -- (0,3) -- (0,0);
\draw (2,2) -- (2.5,2) -- (2.5,2.5) -- (2,2.5) -- (2,2);
\draw (2,0.5) -- (2.5,0.5) -- (2.5,1) -- (2,1) -- (2,0.5);
\draw (1,1) -- (1.5,1) -- (1.5,1.5) -- (1,1.5) -- (1,1);
\draw (2.5,2) -- (7,3.25);
\draw (2.5,2.5) -- (7,3.25);
\draw (2.5,1) -- (5.75,0.25);
\draw (2.5,0.5) -- (5.75,0.25);
\draw (1.5,1.5) -- (5.5,1.25);
\draw (1.5,1) -- (5.5,1.25);
\node at (5.75,4){\begin{tabular}{c}output feature maps\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (5.5,1.5) -- (7.5,1.5) -- (7.5,3.5) -- (5.5,3.5) -- (5.5,1.5);
\draw[fill=black,opacity=0.2,draw=black] (5,1) -- (7,1) -- (7,3) -- (5,3) -- (5,1);
\draw[fill=black,opacity=0.2,draw=black] (4.5,0.5) -- (6.5,0.5) -- (6.5,2.5) -- (4.5,2.5) -- (4.5,0.5);
\draw[fill=black,opacity=0.2,draw=black] (4,0) -- (6,0) -- (6,2) -- (4,2) -- (4,0);
\end{tikzpicture}
\caption[Illustration of a convolutional layer.]{Illustration of a single convolutional layer. If layer $l$ is a convolutional layer, the input image (if $l = 1$) or a feature map of the previous layer is convolved by different filters to yield the output feature maps of layer $l$.}
\label{fig:convolutional-layer}
\end{SCfigure}
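% A sketch of the computation in Figure \ref{fig:convolutional-layer}, where $Y_j^{(l-1)}$ is the
% input image (for $l = 1$) or the $j^{\text{th}}$ feature map of the previous layer; the filter
% names $K_{i,j}^{(l)}$ and biases $b_i^{(l)}$ are illustrative, not fixed by the figure:
\begin{equation*}
	Y_i^{(l)} = b_i^{(l)} + \sum_{j} K_{i,j}^{(l)} * Y_j^{(l-1)}.
\end{equation*}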
\begin{SCfigure}[2\sidecaptionrelwidth][t!]
\centering
\begin{tikzpicture}
\node at (1.75,4.5){\begin{tabular}{c}feature maps\\layer $(l-1)$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (1.5,1.5) -- (3.5,1.5) -- (3.5,3.5) -- (1.5,3.5) -- (1.5,1.5);
\draw[fill=black,opacity=0.2,draw=black] (1,1) -- (3,1) -- (3,3) -- (1,3) -- (1,1);
\draw[fill=black,opacity=0.2,draw=black] (0.5,0.5) -- (2.5,0.5) -- (2.5,2.5) -- (0.5,2.5) -- (0.5,0.5);
\draw[fill=black,opacity=0.2,draw=black] (0,0) -- (2,0) -- (2,2) -- (0,2) -- (0,0);
\draw (3.1,3.1) -- (3.4,3.1) -- (3.4,3.4) -- (3.1,3.4) -- (3.1,3.1);
\draw (2.6,1.1) -- (2.9,1.1) -- (2.9,1.4) -- (2.6,1.4) -- (2.6,1.1);
\draw (1.1,0.1) -- (1.4,0.1) -- (1.4,0.4) -- (1.1,0.4) -- (1.1,0.1);
\draw (3.4,3.4) -- (7.8,2.8);
\draw (3.4,3.1) -- (7.8,2.8);
\draw (2.9,1.4) -- (7.3,1.2);
\draw (2.9,1.1) -- (7.3,1.2);
\draw (1.4,0.4) -- (5.9,0.3);
\draw (1.4,0.1) -- (5.9,0.3);
\node at (6.5,4.5){\begin{tabular}{c}feature maps\\layer $l$\end{tabular}};
\draw[fill=black,opacity=0.2,draw=black] (6.5,1.5) -- (8,1.5) -- (8,3) -- (6.5,3) -- (6.5,1.5);
\draw[fill=black,opacity=0.2,draw=black] (6,1) -- (7.5,1) -- (7.5,2.5) -- (6,2.5) -- (6,1);
\draw[fill=black,opacity=0.2,draw=black] (5.5,0.5) -- (7,0.5) -- (7,2) -- (5.5,2) -- (5.5,0.5);
\draw[fill=black,opacity=0.2,draw=black] (5,0) -- (6.5,0) -- (6.5,1.5) -- (5,1.5) -- (5,0);
\end{tikzpicture}
\caption[Illustration of a pooling and subsampling layer.]{Illustration of a pooling and subsampling layer. If layer $l$ is a pooling and subsampling layer and given $m_1^{(l-1)} = 4$ feature maps of the previous layer, all feature maps are pooled and subsampled individually. Each unit in one of the $m_1^{(l)} = 4$ output feature maps represents the average or the maximum within a fixed window of the corresponding feature map in layer $(l-1)$.}
\label{fig:pooling-layer}
\end{SCfigure}
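% The two pooling choices named in the caption above, written for a fixed window $U(r, c)$
% of the corresponding feature map in layer $(l-1)$ (the window notation is an assumption):
\begin{equation*}
	y_i^{(l)}(r, c) = \frac{1}{|U(r, c)|} \sum_{(u, v) \in U(r, c)} y_i^{(l-1)}(u, v)
	\quad\text{or}\quad
	y_i^{(l)}(r, c) = \max_{(u, v) \in U(r, c)} y_i^{(l-1)}(u, v).
\end{equation*}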
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\begin{figure}[t]
\centering
\begin{tikzpicture}
\node (x) at (0.5,0) {$x$};
\node[fc] (fc1) at (2,0) {\small$\text{fc}_{C_0, C_1}$};
\node[bias] (b1) at (3.5,0) {\small$\text{bias}$};
\node[h] (h1) at (4.5,0) {\small$h$};
\node[fc] (fc2) at (5.75,0) {\small$\text{fc}_{C_1, C_2}$};
\node[bias] (b2) at (7.25,0) {\small$\text{bias}$};
\node[h] (h2) at (8.25,0) {\small$h$};
\node[fc] (fc3) at (9.5,0) {\small$\text{fc}_{C_2, C_3}$};
\node[bias] (b3) at (11,0) {\small$\text{bias}$};
\node[h] (h3) at (12,0) {\small$h$};
\node (y) at (13.5,0) {\small$y$};
\draw[->] (x) -- (fc1);
\draw[->] (fc1) -- (b1);
\draw[->] (b1) -- (h1);
\draw[->] (h1) -- (fc2);
\draw[->] (fc2) -- (b2);
\draw[->] (b2) -- (h2);
\draw[->] (h2) -- (fc3);
\draw[->] (fc3) -- (b3);
\draw[->] (b3) -- (h3);
\draw[->] (h3) -- (y);
\end{tikzpicture}
\vskip 0.5cm
% TODO short caption
% TODO parameters
\caption[]{Illustration of a multi-layer perceptron with $L = 3$ fully-connected
layers followed by bias layers and non-linearities. The sizes $C_1$ and $C_2$ are
hyper-parameters while $C_0$ and $C_3$ are determined by the problem at hand.
Overall, the multi-layer perceptron represents a function $y(x;w)$ parameterized by
the weights $w$ in the fully-connected and bias layers.}
\label{fig:deep-learning-mlp}
\end{figure}
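% One way to write the function computed in Figure \ref{fig:deep-learning-mlp}, using
% (assumed) names $W^{(l)}$ and $b^{(l)}$ for the parameters of the $l^{\text{th}}$
% fully-connected and bias layers:
\begin{equation*}
	y(x; w) = h\!\left(W^{(3)} h\!\left(W^{(2)} h\!\left(W^{(1)} x + b^{(1)}\right) + b^{(2)}\right) + b^{(3)}\right).
\end{equation*}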
\begin{figure}
\centering
\begin{tikzpicture}
\node (x) at (1.25,0) {\small$x$};
\node[fc,rotate=90,minimum width=2cm] (fc1) at (2.5,0) {\small$\text{fc}_{R, C_1}$};
\node[bias,rotate=90,minimum width=2cm] (b1) at (3.75,0) {\small$\text{bias}$};
\node[h,rotate=90,minimum width=2cm] (h1) at (5,0) {\small$h$};
\node[fc,rotate=90,minimum width=2cm] (fc2) at (6.25,0) {\small$\text{fc}_{C_1, C_2}$};
\node[bias,rotate=90,minimum width=2cm] (b2) at (7.5,0) {\small$\text{bias}$};
\node[h,rotate=90,minimum width=2cm] (h2) at (8.75,0) {\small$h$};
\node[fc,rotate=90,minimum width=2cm] (fc3) at (10,0) {\small$\text{fc}_{C_2, Q}$};
\node[bias,rotate=90,minimum width=2cm] (b3) at (11.25,0) {\small$\text{bias}$};
\node[h,rotate=90,minimum width=2cm] (h3) at (12.5,0) {\small$h$};
\node (z) at (13.75,-2.5) {\small$z$};
\node[h,rotate=90,minimum width=2cm] (h6) at (2.5,-2.5) {\small$h$};
\node[bias,rotate=90,minimum width=2cm] (b6) at (3.75,-2.5) {\small$\text{bias}$};
\node[fc,rotate=90,minimum width=2cm] (fc6) at (5,-2.5) {\small$\text{fc}_{C_1, R}$};
\node[h,rotate=90,minimum width=2cm] (h5) at (6.25,-2.5) {\small$h$};
\node[bias,rotate=90,minimum width=2cm] (b5) at (7.5,-2.5) {\small$\text{bias}$};
\node[fc,rotate=90,minimum width=2cm] (fc5) at (8.75,-2.5) {\small$\text{fc}_{C_2, C_1}$};
\node[h,rotate=90,minimum width=2cm] (h4) at (10,-2.5) {\small$h$};
\node[bias,rotate=90,minimum width=2cm] (b4) at (11.25,-2.5) {\small$\text{bias}$};
\node[fc,rotate=90,minimum width=2cm] (fc4) at (12.5,-2.5) {\small$\text{fc}_{Q, C_2}$};
\node (rx) at (1.25,-2.5) {\small$\tilde{x}$};
\draw[->] (x) -- (fc1);
\draw[->] (fc1) -- (b1);
\draw[->] (b1) -- (h1);
\draw[->] (h1) -- (fc2);
\draw[->] (fc2) -- (b2);
\draw[->] (b2) -- (h2);
\draw[->] (h2) -- (fc3);
\draw[->] (fc3) -- (b3);
\draw[->] (b3) -- (h3);
\draw[-] (h3) -- (13.75,0);
\draw[->] (13.75,0) -- (z);
\draw[->] (z) -- (fc4);
\draw[->] (fc4) -- (b4);
\draw[->] (b4) -- (h4);
\draw[->] (h4) -- (fc5);
\draw[->] (fc5) -- (b5);
\draw[->] (b5) -- (h5);
\draw[->] (h5) -- (fc6);
\draw[->] (fc6) -- (b6);
\draw[->] (b6) -- (h6);
\draw[->] (h6) -- (rx);
\node[rotate=90] (L) at (1.25, -1.25) {\small$\mathcal{L}(\tilde{x}, x)$};
\draw[-,dashed] (x) -- (L);
\draw[-,dashed] (rx) -- (L);
\end{tikzpicture}
\vskip 0.5cm
\caption{A simple variant of a multi-layer perceptron based auto-encoder.
Both encoder (top) and decoder (bottom) consist of 3-layer perceptrons
taking an $R$-dimensional
input $x$. The parameters $C_1, C_2,$ and $Q$ can be chosen; $Q$ also
determines the size of the latent code $z$ and is usually chosen significantly
lower than $R$ such that the auto-encoder learns a dimensionality reduction.
The non-linearity $h$ is also not fixed and might be determined experimentally.
The reconstruction loss $\mathcal{L}(\tilde{x}, x)$ quantifies the quality of
the reconstruction $\tilde{x}$ and is minimized during training.}
\label{subfig:deep-learning-auto-encoder}
\end{figure}
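% The caption of Figure \ref{subfig:deep-learning-auto-encoder} leaves the reconstruction
% loss open; the sum-of-squared-errors below is only one common example choice:
\begin{equation*}
	\mathcal{L}(\tilde{x}, x) = \|\tilde{x} - x\|_2^2 = \sum_{i=1}^{R} (\tilde{x}_i - x_i)^2.
\end{equation*}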
\begin{figure}
\centering
\begin{tikzpicture}
\node (x) at (1.25,0) {\small$x$};
%\node[noise,rotate=90,minimum width=3.3cm] (noise1) at(1.25,0) {\small$\text{noise}_{\sigma^2}$};
\node[conv,rotate=90,minimum width=3.3cm] (conv1) at (2.5,0) {\small$\text{conv}_{1, C_1, K}$\,+\,$\text{bias}$};
%\node[bias,rotate=90,minimum width=3cm] (bias1) at (3.75,0) {$\text{bias}$};
\node[h,rotate=90,minimum width=3.3cm] (h1) at (3.75,0) {\small$h$};
\node[pool,rotate=90,minimum width=3.3cm] (pool1) at (5,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=3.3cm] (conv2) at (6.25,0) {\small$\text{conv}_{C_1, C_2, K}$\,+\,$\text{bias}$};
%\node[bias,rotate=90,minimum width=3cm] (bias2) at (8.75,0) {$\text{bias}$};
\node[h,rotate=90,minimum width=3.3cm] (h2) at (7.5,0) {\small$h$};
\node[pool,rotate=90,minimum width=3.3cm] (pool2) at (8.75,0) {\small$\text{pool}_{2}$};
\node[view,rotate=90,minimum width=3.3cm] (view2) at (10,0) {\small$\text{view}_{B, C_3}$};
\node[fc,rotate=90,minimum width=3.3cm] (fc2) at (11.25,0) {\small$\text{fc}_{C_3,Q}$};
\node (z) at (12.5,-3.75) {\small$z$};
\node[h,rotate=90,minimum width=3.3cm] (h4) at (6.25,-3.75) {\small$h$};
%\node[bias,rotate=90,minimum width=3cm] (bias4) at (8.75,-3.75) {$\text{bias}$};
\node[conv,rotate=90,minimum width=3.3cm] (conv4) at (7.5,-3.75) {\small$\text{conv}_{C_2, C_1, K}$\,+\,$\text{bias}$};
\node[up,rotate=90,minimum width=3.3cm] (up4) at (8.75,-3.75) {\small$\text{nnup}_{2}$};
\node[view,rotate=90,minimum width=3.3cm] (view4) at (10,-3.75) {\small$\text{view}_{B, C_2, \frac{H}{4}, \frac{W}{4}, \frac{D}{4}}$};
\node[fc,rotate=90,minimum width=3.3cm] (fc4) at (11.25,-3.75) {\small$\text{fc}_{Q,C_3}$};
\node[h,rotate=90,minimum width=3.3cm] (h5) at (2.5,-3.75) {\small$h$};
%\node[bias,rotate=90,minimum width=3cm] (bias5) at (3.75,-4) {$\text{bias}$};
\node[conv,rotate=90,minimum width=3.3cm] (conv5) at (3.75,-3.75) {\small$\text{conv}_{C_2, 1, K}$\,+\,$\text{bias}$};
\node[up,rotate=90,minimum width=3.3cm] (up5) at (5,-3.75) {\small$\text{nnup}_{2}$};
\node (rx) at (1.25,-3.75) {\small$\tilde{x}$};
%\draw[->] (x) -- (noise1);
\draw[->] (x) -- (conv1);
\draw[->] (conv1) -- (h1);
%\draw[->] (bias1) -- (h1);
\draw[->] (h1) -- (pool1);
\draw[->] (pool1) -- (conv2);
\draw[->] (conv2) -- (h2);
%\draw[->] (bias2) -- (h2);
\draw[->] (h2) -- (pool2);
\draw[->] (pool2) -- (view2);
\draw[->] (view2) -- (fc2);
\draw[-] (fc2) -- (12.5,0);
\draw[->] (12.5,0) -- (z);
\draw[->] (z) -- (fc4);
\draw[->] (fc4) -- (view4);
\draw[->] (view4) -- (up4);
\draw[->] (up4) -- (conv4);
%\draw[->] (conv4) -- (bias4);
\draw[->] (conv4) -- (h4);
\draw[->] (h4) -- (up5);
\draw[->] (up5) -- (conv5);
\draw[->] (conv5) -- (h5);
%\draw[->] (bias5) -- (h5);
\draw[->] (h5) -- (rx);
\node[rotate=90] (L) at (1.25, -1.875) {\small$\mathcal{L}(\tilde{x}, x)$};
\draw[-,dashed] (x) -- (L);
\draw[-,dashed] (rx) -- (L);
\end{tikzpicture}
\vskip 0.5cm
% TODO short caption
\caption{Illustration of a convolutional auto-encoder consisting of encoder (top)
and decoder (bottom). Both are modeled using two stages of convolutional
layers each followed by a bias layer and a non-linearity layer. The encoder uses
max pooling to decrease the spatial size of the input; the decoder uses upsampling
to increase it again. The number of channels $C_1$, $C_2$ and $C_3$ as well as
the size $Q$ are hyper-parameters. We assume the input comprises one channel.
Again, the reconstruction loss $\mathcal{L}(\tilde{x}, x)$ quantifies the quality of
the reconstruction and is minimized during training.}
\label{fig:deep-learning-convolutional-auto-encoder}
\end{figure}
\clearpage
\begin{figure}
\centering
\hspace*{-0.75cm}
\begin{tikzpicture}
\node (y) at (0,0) {\small$y$};
\node[conv,rotate=90,minimum width=4.5cm] (conv1) at (1.25,0) {\small$\text{conv}_{1, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool1) at (2.5,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv2) at (3.75,0) {\small$\text{conv}_{16, 32, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool2) at (5,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv3) at (6.25,0) {\small$\text{conv}_{32, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool3) at (7.5,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv4) at (8.75,0) {\small$\text{conv}_{64, 128, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool4) at (10,0) {\small$\text{pool}_{2}$};
\node[view,rotate=90,minimum width=4.5cm] (view4) at (11.25,0) {\small$\text{view}_{B, 1024}$};
\node[fc,rotate=90, minimum width = 2cm] (fc51) at (12.5,1.25) {\small$\text{fc}_{512, Q}$};
\node[fc,rotate=90, minimum width = 2cm] (fc52) at (12.5,-1.25) {\small$\text{fc}_{512, Q}$};
\node[view,rotate=90,minimum width=4.5cm] (kld) at (13.75,0) {\small$\text{KLD}_{\mathcal{N}}$\,+\,$\text{repa}_{\mathcal{N}}$};
\node (z) at (13.75,-5){\small$\tilde{z}$};
\node[fc,rotate=90,minimum width=4.5cm] (fc6) at (12.5,-5) {\small$\text{fc}_{Q, 512}$};
\node[view,rotate=90,minimum width=4.5cm] (view6) at (11.25,-5) {\small$\text{view}_{B, 1024, 2, 2, 2}$};
\node[up,rotate=90,minimum width=4.5cm] (up7) at (10,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv7) at (8.75,-5) {\small$\text{conv}_{128, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up8) at (7.5,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv8) at (6.25,-5) {\small$\text{conv}_{64, 32, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up9) at (5,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv9) at (3.75,-5) {\small$\text{conv}_{32, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up10) at (2.5,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv10) at (1.25,-5) {\small$\text{conv}_{16, 1, 3}$\,+\,$\text{bn}$\,+\,$h$};
\node (ry) at (0,-5) {\small$\tilde{y}$};
\draw[->] (y) -- (conv1);
\draw[->] (conv1) -- (pool1);
\draw[->] (pool1) -- (conv2);
\draw[->] (conv2) -- (pool2);
\draw[->] (pool2) -- (conv3);
\draw[->] (conv3) -- (pool3);
\draw[->] (pool3) -- (conv4);
\draw[->] (conv4) -- (pool4);
\draw[->] (pool4) -- (view4);
\draw[->] (view4) -- (fc51);
\draw[->] (view4) -- (fc52);
\draw[->] (fc51) -- (kld);
\draw[->] (fc52) -- (kld);
\draw[->] (kld) -- (z);
\draw[->] (z) -- (fc6);
\draw[->] (fc6) -- (view6);
\draw[->] (view6) -- (up7);
\draw[->] (up7) -- (conv7);
\draw[->] (conv7) -- (up8);
\draw[->] (up8) -- (conv8);
\draw[->] (conv8) -- (up9);
\draw[->] (up9) -- (conv9);
\draw[->] (conv9) -- (up10);
\draw[->] (up10) -- (conv10);
\draw[->] (conv10) -- (ry);
\node[rotate=90] (L) at (0, -2.5) {\small$\mathcal{L}(\tilde{y}, y) = -\ln p(y | \tilde{z})$};
\draw[-,dashed] (y) -- (L);
\draw[-,dashed] (ry) -- (L);
\node[rotate=90] (KLD) at (14.5, -5) {\small$\KL(q(z|y) | p(z))$};
\draw[-,dashed] (KLD) -- (z);
\end{tikzpicture}
\vskip 0.5cm
\caption{An illustration of a Gaussian variational auto-encoder with
four convolutional stages in both encoder and decoder. This is also the
architecture used in the experiments in Chapter \ref{ch:experiments}, where
we assume volumes of size $32 \times 32 \times 32$ such that the spatial
size just before the fully-connected layers of the encoder is
$2 \times 2 \times 2$, resulting in a $1024$-dimensional representation
when considering $128$ channels.}
\label{subfig:experiments-2d-architecture-vae}
\end{figure}
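% The size bookkeeping stated in the caption of Figure \ref{subfig:experiments-2d-architecture-vae}:
% four $\text{pool}_2$ stages halve each spatial dimension four times, and the remaining
% $128$ channels are flattened by the view layer:
\begin{equation*}
	\frac{32}{2^4} = 2, \qquad 128 \cdot 2 \cdot 2 \cdot 2 = 1024.
\end{equation*}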
\begin{figure}
\centering
\hspace*{-0.75cm}
\begin{tikzpicture}
\node (x) at (0,0) {\small$x$};
\node[conv,rotate=90,minimum width=4.5cm] (conv1) at (2.5,0) {\small$\text{conv}_{1, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool1) at (3.75,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv2) at (5,0) {\small$\text{conv}_{16, 32, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool2) at (6.25,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv3) at (7.5,0) {\small$\text{conv}_{32, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool3) at (8.75,0) {\small$\text{pool}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv4) at (10,0) {\small$\text{conv}_{64, 128, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[pool,rotate=90,minimum width=4.5cm] (pool4) at (11.25,0) {\small$\text{pool}_{2}$};
\node[view,rotate=90,minimum width=4.5cm] (view4) at (12.5,0) {\small$\text{view}_{B, 1024}$};
\node[fc,rotate=90, minimum width = 2cm] (fc51) at (13.75,1.25) {\small$\text{fc}_{512, Q}$};
\node[fc,rotate=90, minimum width = 2cm] (fc52) at (13.75,-1.25) {\small$\text{fc}_{512, Q}$};
\node[view,rotate=90,minimum width=4.5cm] (kld) at (15,0) {\small$\text{KLD}_{\mathcal{N}}$\,+\,$\text{repa}_{\mathcal{N}}$};
\node (z) at (15,-5){\small$\tilde{z}$};
\node[fc,rotate=90,minimum width=4.5cm] (fc6) at (13.75,-5) {\small$\text{fc}_{Q, 512}$};
\node[view,rotate=90,minimum width=4.5cm] (view6) at (12.5,-5) {\small$\text{view}_{B, 1024, 2, 2, 2}$};
\node[up,rotate=90,minimum width=4.5cm] (up7) at (11.25,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv7) at (10,-5) {\small$\text{conv}_{128, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up8) at (8.75,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv8) at (7.5,-5) {\small$\text{conv}_{64, 32, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up9) at (6.25,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv9) at (5,-5) {\small$\text{conv}_{32, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[up,rotate=90,minimum width=4.5cm] (up10) at (3.75,-5) {\small$\text{nnup}_{2}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv10) at (2.5,-5) {\small$\text{conv}_{16, 1, 3}$\,+\,$\text{bn}$\,+\,$h$};
\node[view,rotate=90,minimum width=4.5cm] (kld2) at (1.25,-5) {\small$\text{KLD}_{\Ber}$\,+\,$\text{repa}_{\Ber}$\,+\,$\sigma$};
\node (ry) at (0,-5) {\small$\tilde{y}$};
\node[conv,rotate=90,minimum width=4.5cm] (conv11) at (1.25,-10) {\small$\text{conv}_{1, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv12) at (2.5,-10) {\small$\text{conv}_{16, 32, 3}$\,+\,$\text{bn}$ + $\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv13) at (3.75,-10) {\small$\text{conv}_{32, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv14) at (5,-10) {\small$\text{conv}_{64, 128, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv15) at (6.25,-10) {\small$\text{conv}_{128, 64, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv16) at (7.5,-10) {\small$\text{conv}_{64, 32, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv17) at (8.75,-10) {\small$\text{conv}_{32, 16, 3}$\,+\,$\text{bn}$\,+\,$\ReLU$};
\node[conv,rotate=90,minimum width=4.5cm] (conv18) at (10,-10) {\small$\text{conv}_{16, 1, 3}$\,+\,$\text{bn}$\,+\,$\sigma$};
\node (rx) at (15, -10) {\small$\tilde{x}$};
\draw[->] (x) -- (conv1);
\draw[->] (conv1) -- (pool1);
\draw[->] (pool1) -- (conv2);
\draw[->] (conv2) -- (pool2);
\draw[->] (pool2) -- (conv3);
\draw[->] (conv3) -- (pool3);
\draw[->] (pool3) -- (conv4);
\draw[->] (conv4) -- (pool4);
\draw[->] (pool4) -- (view4);
\draw[->] (view4) -- (fc51);
\draw[->] (view4) -- (fc52);
\draw[->] (fc51) -- (kld);
\draw[->] (fc52) -- (kld);
\draw[->] (kld) -- (z);
\draw[->] (z) -- (fc6);
\draw[->] (fc6) -- (view6);
\draw[->] (view6) -- (up7);
\draw[->] (up7) -- (conv7);
\draw[->] (conv7) -- (up8);
\draw[->] (up8) -- (conv8);
\draw[->] (conv8) -- (up9);
\draw[->] (up9) -- (conv9);
\draw[->] (conv9) -- (up10);
\draw[->] (up10) -- (conv10);
\draw[->] (conv10) -- (kld2);
\draw[->] (kld2) -- (ry);
\draw[-] (ry) -- (0, -10);
\draw[->] (0, -10) -- (conv11);
\draw[->] (conv11) -- (conv12);
\draw[->] (conv12) -- (conv13);
\draw[->] (conv13) -- (conv14);
\draw[->] (conv14) -- (conv15);
\draw[->] (conv15) -- (conv16);
\draw[->] (conv16) -- (conv17);
\draw[->] (conv17) -- (conv18);
\draw[->] (conv18) -- (rx);
\node[rotate=90] (L2) at (15, -11.5) {\small$\mathcal{L}(\tilde{x}, x)$};
\draw[-,dashed] (rx) -- (L2);
\node[rotate=90] (L1) at (0, 1.5) {\small$\mathcal{L}(\tilde{x}, x)$};
\draw[-,dashed] (x) -- (L1);
\node[rotate=90] (KLD2) at (0, -2.5) {\small$\KL(p(y|x)|p(y|z))$};
\draw[-,dashed] (ry) -- (KLD2);
\node[rotate=90] (KLD1) at (15, -7.5) {\small$\KL(q(z|y)|p(z))$};
\draw[-,dashed] (z) -- (KLD1);
\end{tikzpicture}
\vskip 0.5cm
\caption{Illustration of the extended variational auto-encoder.
It is worth comparing the illustrated implementation with Figure
\ref{subfig:experiments-2d-architecture-vae} to understand that the
generative model $p(y | z)$ is taken from the shape prior and the new
recognition model $q(z | x)$ follows the architecture of $q(z | y)$.
In particular, the decoder, \ie $p(y|z)$, is kept fixed after learning the shape prior.
The only completely new part is the convolutional neural network implementing the
observation model $p(x|y)$. It consists of
eight convolutional stages including batch normalization and non-linearities.
Here, no pooling layers can be found as the size of the output $x$
matches the size of its input $y$. We additionally make the loss $\mathcal{L}(\tilde{x}, x)$
between reconstructed observation $\tilde{x}$ and original observation $x$
as well as the Kullback-Leibler divergences $\KL(q(z|y)|p(z))$, for the prior $p(z)$,
and $\KL(p(y|x)|p(y|z))$, tying observations to predicted shapes, explicit.}
\label{fig:shape-inference-evae}
\end{figure}
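% The training objective is not written out in the caption of Figure \ref{fig:shape-inference-evae};
% collecting the three terms it makes explicit (any weighting between them is omitted here) gives
\begin{equation*}
	\mathcal{L}(\tilde{x}, x) + \KL(p(y|x)|p(y|z)) + \KL(q(z|y)|p(z)).
\end{equation*}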
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\begin{tikzpicture}
% ------- style -------
\tikzset{%
parenthesized/.style={%
left delimiter = (,
right delimiter = ),
},
node distance = 10mu,
}
% ------- equation -------
\matrix[matrix of math nodes, parenthesized] (I) {
0 & 1 & 1 & 1 & 0 & 0 & 0 \\
0 & 0 & 1 & 1 & 1 & 0 & 0 \\
0 & 0 & 0 & 1 & 1 & 1 & 0 \\
0 & 0 & 0 & 1 & 1 & 0 & 0 \\
0 & 0 & 1 & 1 & 0 & 0 & 0 \\
0 & 1 & 1 & 0 & 0 & 0 & 0 \\
1 & 1 & 0 & 0 & 0 & 0 & 0 \\
};
\node (*) [right = of I] {${}*{}$};
\newcommand\Kmatrix{}
\foreach \row in {1, ..., 3} {
\gdef \sep {}
\foreach \col in {1, ..., 3} {%
\xdef \Kmatrix {\unexpanded\expandafter{\Kmatrix}\unexpanded\expandafter{\sep}\noexpand \K{\row}{\col}}
\gdef \sep { \& }
}
\xdef \Kmatrix {\unexpanded\expandafter{\Kmatrix}\noexpand\\}
}
\matrix[matrix of math nodes, parenthesized, ampersand replacement=\&] (K) [right = of *] {
\Kmatrix
};
\node (=) [right = of K] {${}={}$};
\matrix[matrix of math nodes, parenthesized] (I*K) [right = of {=}] {
1 & 4 & 3 & 4 & 1 \\
1 & 2 & 4 & 3 & 3 \\
1 & 2 & 3 & 4 & 1 \\
1 & 3 & 3 & 1 & 1 \\
3 & 3 & 1 & 1 & 0 \\
};
% ------- highlighting -------
\newcommand\rowResult{1}
\newcommand\colResult{4}
\begin{scope}[on background layer]
\newcommand{\padding}{2pt}
\coordinate (Is-nw) at ([xshift=-\padding, yshift=+\padding] I-\rowResult-\colResult.north west);
\coordinate (Is-se) at ([xshift=+\padding, yshift=-\padding] I-\the\numexpr\rowResult+\numRowsK-1\relax-\the\numexpr\colResult+\numColsK-1\relax.south east);
\coordinate (Is-sw) at (Is-nw |- Is-se);
\coordinate (Is-ne) at (Is-se |- Is-nw);
\filldraw[red, fill opacity=.1] (Is-nw) rectangle (Is-se);
\filldraw[green, fill opacity=.1] (I*K-\rowResult-\colResult.north west) rectangle (I*K-\rowResult-\colResult.south east);
\draw[blue, dotted]
(Is-nw) -- (K.north west)
(Is-se) -- (K.south east)
(Is-sw) -- (K.south west)
(Is-ne) -- (K.north east)
;
\draw[green, dotted]
(I*K-\rowResult-\colResult.north west) -- (K.north west)
(I*K-\rowResult-\colResult.south east) -- (K.south east)
(I*K-\rowResult-\colResult.south west) -- (K.south west)
(I*K-\rowResult-\colResult.north east) -- (K.north east)
;
\draw[blue, fill=blue!10!white] (K.north west) rectangle (K.south east);
\foreach \row [evaluate=\row as \rowI using int(\row+\rowResult-1)] in {1, ..., \numRowsK} {%
\foreach \col [evaluate=\col as \colI using int(\col+\colResult-1)] in {1, ..., \numColsK} {%
\node[text=blue] at (I-\rowI-\colI.south east) [xshift=-.3em] {\tiny$\times \K{\row}{\col}$};
}
}
\end{scope}
% ------- labels -------
\tikzset{node distance=0em}
\node[below=of I] (I-label) {$I$};
\node at (K |- I-label) {$K$};
\node at (I*K |- I-label) {$I*K$};
\end{tikzpicture}%
\begin{tikzpicture}
\matrix (mtr) [matrix of nodes,row sep=-\pgflinewidth, nodes={draw}]
{
0 & 1 & 1 & |[fill=red!30]| 1 & |[fill=red!30]| 0 & |[fill=red!30]| 0 & 0\\
0 & 0 & 1 & |[fill=red!30]| 1 & |[fill=red!30]| 1 & |[fill=red!30]| 0 & 0\\
0 & 0 & 0 & |[fill=red!30]| 1 & |[fill=red!30]| 1 & |[fill=red!30]| 1 & 0\\
0 & 0 & 0 & 1 & 1 & 0 & 0\\
0 & 0 & 1 & 1 & 0 & 0 & 0\\
0 & 1 & 1 & 0 & 0 & 0 & 0\\
1 & 1 & 0 & 0 & 0 & 0 & 0\\
};
\draw[very thick, red] (mtr-1-4.north west) rectangle (mtr-3-6.south east);
\node [below= of mtr-5-4.south] (lm) {$\bf I$};
\node[right = 0.2em of mtr] (str) {$*$};
\matrix (K) [right=0.2em of str,matrix of nodes,row sep=-\pgflinewidth, nodes={draw, fill=blue!30}]
{
1 & 0 & 1 \\
0 & 1 & 0 \\
1 & 0 & 1 \\
};
\node [below = of K-3-2.south] (lk) {$\bf K$};
\node [right = 0.2em of K] (eq) {$=$};
\matrix (ret) [right=0.2em of eq,matrix of nodes,row sep=-\pgflinewidth, nodes={draw}]
{
1 & 4 & 3 & |[fill=green!30]| 4 & 1\\
1 & 2 & 4 & 3 & 3\\
1 & 2 & 3 & 4 & 1\\
1 & 3 & 3 & 1 & 1\\
3 & 3 & 1 & 1 & 0\\
};
\node [below = of ret-4-3.south] (lim) {${\bf I} * {\bf K}$};
\draw[very thick, green] (ret-1-4.north west) rectangle (ret-1-4.south east);
\draw[densely dotted, blue, thick] (mtr-1-4.north west) -- (K-1-1.north west);
\draw[densely dotted, blue, thick] (mtr-3-4.south west) -- (K-3-1.south west);
\draw[densely dotted, blue, thick] (mtr-1-6.north east) -- (K-1-3.north east);
\draw[densely dotted, blue, thick] (mtr-3-6.south east) -- (K-3-3.south east);
\draw[densely dotted, green, thick] (ret-1-4.north west) -- (K-1-1.north west);
\draw[densely dotted, green, thick] (ret-1-4.south west) -- (K-3-1.south west);
\draw[densely dotted, green, thick] (ret-1-4.north east) -- (K-1-3.north east);
\draw[densely dotted, green, thick] (ret-1-4.south east) -- (K-3-3.south east);
\matrix (K) [right=0.2em of str,matrix of nodes,row sep=-\pgflinewidth, nodes={draw, fill=blue!10}]
{
1 & 0 & 1 \\
0 & 1 & 0 \\
1 & 0 & 1 \\
};
\draw[very thick, blue] (K-1-1.north west) rectangle (K-3-3.south east);
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-1-4.south east) (xx) {\scalebox{.5}{$\times 1$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-1-5.south east) (xx) {\scalebox{.5}{$\times 0$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-1-6.south east) (xx) {\scalebox{.5}{$\times 1$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-2-4.south east) (xx) {\scalebox{.5}{$\times 0$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-2-5.south east) (xx) {\scalebox{.5}{$\times 1$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-2-6.south east) (xx) {\scalebox{.5}{$\times 0$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-3-4.south east) (xx) {\scalebox{.5}{$\times 1$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-3-5.south east) (xx) {\scalebox{.5}{$\times 0$}};
\node[anchor=south east, inner sep=0.01em, blue] at (mtr-3-6.south east) (xx) {\scalebox{.5}{$\times 1$}};
\end{tikzpicture}
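% The highlighted entry of $I * K$ (row 1, column 4 in both illustrations above), worked out
% element-wise over the marked $3 \times 3$ window of $I$:
\begin{equation*}
	1 \cdot 1 + 0 \cdot 0 + 0 \cdot 1
	+ 1 \cdot 0 + 1 \cdot 1 + 0 \cdot 0
	+ 1 \cdot 1 + 1 \cdot 0 + 1 \cdot 1 = 4.
\end{equation*}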
\begin{align*}
x\tikzmark{x}(t)*h\tikzmark{h}(t) &= y\tikzmark{y}(t) \\[2em]
X(f) \, H(f) &= Y(f)
\end{align*}
\begin{tikzpicture}[overlay,remember picture, > = {Circle[open,blue]}]
\draw [<->] ([yshift=-.7ex]pic cs:x) -- ++(0,-2.2em);
\draw [<->] ([yshift=-.7ex]pic cs:h) -- ++(0,-2.2em);
\draw [<->] ([yshift=-.7ex]pic cs:y) -- ++(0,-2.2em);
\end{tikzpicture}
\end{document}